src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001-2011 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   4      2005, 2006, 2007, 2008, 2009, 2010, 2011
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software: you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation, either version 3 of the License, or
  16 (at your option) any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (struct coding_system *coding,
 157                    struct coding_detection_info *detect_info)
 158 {
 159   const unsigned char *src = coding->source;
 160   const unsigned char *src_end = coding->source + coding->src_bytes;
 161   int multibytep = coding->src_multibyte;
 162   int consumed_chars = 0;
 163   int found = 0;
 164   ...;
 165
 166   while (1)
 167     {
 168       /* Get one byte from the source.  If the source is exhausted, jump
 169          to no_more_source:.  */
 170       ONE_MORE_BYTE (c);
 171
 172       if (! __C_conforms_to_XXX___ (c))
 173         break;
 174       if (! __C_strongly_suggests_XXX__ (c))
 175         found = CATEGORY_MASK_XXX;
 176     }
 177   /* The byte sequence is invalid for XXX.  */
 178   detect_info->rejected |= CATEGORY_MASK_XXX;
 179   return 0;
 180
 181  no_more_source:
 182   /* The source exhausted successfully.  */
 183   detect_info->found |= found;
 184   return 1;
 185 }
 186 #endif
 187
 188 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 189
 190   These functions decode a byte sequence specified as a source by
 191   CODING.  The resulting multibyte text goes to a place pointed to by
 192   CODING->charbuf, the length of which should not exceed
 193   CODING->charbuf_size;
 194
 195   These functions set the information of original and decoded texts in
 196   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 197   They also set CODING->result to one of CODING_RESULT_XXX indicating
 198   how the decoding is finished.
 199
 200   Below is the template of these functions.  */
 201
 202 #if 0
 203 static void
 204 decode_coding_XXXX (struct coding_system *coding)
 205 {
 206   const unsigned char *src = coding->source + coding->consumed;
 207   const unsigned char *src_end = coding->source + coding->src_bytes;
 208   /* SRC_BASE remembers the start position in source in each loop.
 209      The loop will be exited when there's not enough source code, or
 210      when there's no room in CHARBUF for a decoded character.  */
 211   const unsigned char *src_base;
 212   /* A buffer to produce decoded characters.  */
 213   int *charbuf = coding->charbuf + coding->charbuf_used;
 214   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 215   int multibytep = coding->src_multibyte;
 216
 217   while (1)
 218     {
 219       src_base = src;
 220       if (charbuf < charbuf_end)
 221         /* No more room to produce a decoded character.  */
 222         break;
 223       ONE_MORE_BYTE (c);
 224       /* Decode it. */
 225     }
 226
 227  no_more_source:
 228   if (src_base < src_end
 229       && coding->mode & CODING_MODE_LAST_BLOCK)
 230     /* If the source ends by partial bytes to construct a character,
 231        treat them as eight-bit raw data.  */
 232     while (src_base < src_end && charbuf < charbuf_end)
 233       *charbuf++ = *src_base++;
 234   /* Remember how many bytes and characters we consumed.  If the
 235      source is multibyte, the bytes and chars are not identical.  */
 236   coding->consumed = coding->consumed_char = src_base - coding->source;
 237   /* Remember how many characters we produced.  */
 238   coding->charbuf_used = charbuf - coding->charbuf;
 239 }
 240 #endif
 241
 242 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 243
 244   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 245   internal multibyte format by CODING.  The resulting byte sequence
 246   goes to a place pointed to by DESTINATION, the length of which
 247   should not exceed DST_BYTES.
 248
 249   These functions set the information of original and encoded texts in
 250   the members produced, produced_char, consumed, and consumed_char of
 251   the structure *CODING.  They also set the member result to one of
 252   CODING_RESULT_XXX indicating how the encoding finished.
 253
 254   DST_BYTES zero means that source area and destination area are
 255   overlapped, which means that we can produce a encoded text until it
 256   reaches at the head of not-yet-encoded source text.
 257
 258   Below is a template of these functions.  */
 259 #if 0
 260 static void
 261 encode_coding_XXX (struct coding_system *coding)
 262 {
 263   int multibytep = coding->dst_multibyte;
 264   int *charbuf = coding->charbuf;
 265   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 266   unsigned char *dst = coding->destination + coding->produced;
 267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 268   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 269   int produced_chars = 0;
 270
 271   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 272     {
 273       int c = *charbuf;
 274       /* Encode C into DST, and increment DST.  */
 275     }
 276  label_no_more_destination:
 277   /* How many chars and bytes we produced.  */
 278   coding->produced_char += produced_chars;
 279   coding->produced = dst - coding->destination;
 280 }
 281 #endif
 282
 283 \f
 284 /*** 1. Preamble ***/
 285
 286 #include <config.h>
 287 #include <stdio.h>
 288 #include <setjmp.h>
 289
 290 #include "lisp.h"
 291 #include "buffer.h"
 292 #include "character.h"
 293 #include "charset.h"
 294 #include "ccl.h"
 295 #include "composite.h"
 296 #include "coding.h"
 297 #include "window.h"
 298 #include "frame.h"
 299 #include "termhooks.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302
 303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 304 Lisp_Object Qunix, Qdos;
 305 Lisp_Object Qbuffer_file_coding_system;
 306 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 307 Lisp_Object Qdefault_char;
 308 Lisp_Object Qno_conversion, Qundecided;
 309 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 310 Lisp_Object Qbig, Qlittle;
 311 Lisp_Object Qcoding_system_history;
 312 Lisp_Object Qvalid_codes;
 313 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 314 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 315 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 316 Lisp_Object QCascii_compatible_p;
 317
 318 Lisp_Object Qcall_process, Qcall_process_region;
 319 Lisp_Object Qstart_process, Qopen_network_stream;
 320 Lisp_Object Qtarget_idx;
 321
 322 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 323 Lisp_Object Qinterrupted, Qinsufficient_memory;
 324
 325 /* If a symbol has this property, evaluate the value to define the
 326    symbol as a coding system.  */
 327 static Lisp_Object Qcoding_system_define_form;
 328
 329 /* Format of end-of-line decided by system.  This is Qunix on
 330    Unix and Mac, Qdos on DOS/Windows.
 331    This has an effect only for external encoding (i.e. for output to
 332    file and process), not for in-buffer or Lisp string encoding.  */
 333 static Lisp_Object system_eol_type;
 334
 335 #ifdef emacs
 336
 337 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 338
 339 /* Coding system emacs-mule and raw-text are for converting only
 340    end-of-line format.  */
 341 Lisp_Object Qemacs_mule, Qraw_text;
 342 Lisp_Object Qutf_8_emacs;
 343
 344 /* Coding-systems are handed between Emacs Lisp programs and C internal
 345    routines by the following three variables.  */
 346 /* Coding system to be used to encode text for terminal display when
 347    terminal coding system is nil.  */
 348 struct coding_system safe_terminal_coding;
 349
 350 #endif /* emacs */
 351
 352 Lisp_Object Qtranslation_table;
 353 Lisp_Object Qtranslation_table_id;
 354 Lisp_Object Qtranslation_table_for_decode;
 355 Lisp_Object Qtranslation_table_for_encode;
 356
 357 /* Two special coding systems.  */
 358 Lisp_Object Vsjis_coding_system;
 359 Lisp_Object Vbig5_coding_system;
 360
 361 /* ISO2022 section */
 362
 363 #define CODING_ISO_INITIAL(coding, reg)                 \
 364   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 365                      coding_attr_iso_initial),          \
 366                reg)))
 367
 368
 369 #define CODING_ISO_REQUEST(coding, charset_id)          \
 370   (((charset_id) <= (coding)->max_charset_id            \
 371     ? ((coding)->safe_charsets[charset_id] != 255       \
 372        ? (coding)->safe_charsets[charset_id]            \
 373        : -1)                                            \
 374     : -1))
 375
 376
 377 #define CODING_ISO_FLAGS(coding)        \
 378   ((coding)->spec.iso_2022.flags)
 379 #define CODING_ISO_DESIGNATION(coding, reg)     \
 380   ((coding)->spec.iso_2022.current_designation[reg])
 381 #define CODING_ISO_INVOCATION(coding, plane)    \
 382   ((coding)->spec.iso_2022.current_invocation[plane])
 383 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 384   ((coding)->spec.iso_2022.single_shifting)
 385 #define CODING_ISO_BOL(coding)  \
 386   ((coding)->spec.iso_2022.bol)
 387 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 388   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 389 #define CODING_ISO_CMP_STATUS(coding)   \
 390   (&(coding)->spec.iso_2022.cmp_status)
 391 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 392   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 393 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 394   ((coding)->spec.iso_2022.embedded_utf_8)
 395
 396 /* Control characters of ISO2022.  */
 397                         /* code */      /* function */
 398 #define ISO_CODE_LF     0x0A            /* line-feed */
 399 #define ISO_CODE_CR     0x0D            /* carriage-return */
 400 #define ISO_CODE_SO     0x0E            /* shift-out */
 401 #define ISO_CODE_SI     0x0F            /* shift-in */
 402 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 403 #define ISO_CODE_ESC    0x1B            /* escape */
 404 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 405 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 406 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 407
 408 /* All code (1-byte) of ISO2022 is classified into one of the
 409    followings.  */
 410 enum iso_code_class_type
 411   {
 412     ISO_control_0,              /* Control codes in the range
 413                                    0x00..0x1F and 0x7F, except for the
 414                                    following 5 codes.  */
 415     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 416     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 417     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 418     ISO_escape,                 /* ISO_CODE_SO (0x1B) */
 419     ISO_control_1,              /* Control codes in the range
 420                                    0x80..0x9F, except for the
 421                                    following 3 codes.  */
 422     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 423     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 424     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 425     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 426     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 427     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 428     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 429   };
 430
 431 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 432     `iso-flags' attribute of an iso2022 coding system.  */
 433
 434 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 435    instead of the correct short-form sequence (e.g. ESC $ A).  */
 436 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 437
 438 /* If set, reset graphic planes and registers at end-of-line to the
 439    initial state.  */
 440 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 441
 442 /* If set, reset graphic planes and registers before any control
 443    characters to the initial state.  */
 444 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 445
 446 /* If set, encode by 7-bit environment.  */
 447 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 448
 449 /* If set, use locking-shift function.  */
 450 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 451
 452 /* If set, use single-shift function.  Overwrite
 453    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 454 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 455
 456 /* If set, use designation escape sequence.  */
 457 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 458
 459 /* If set, produce revision number sequence.  */
 460 #define CODING_ISO_FLAG_REVISION        0x0080
 461
 462 /* If set, produce ISO6429's direction specifying sequence.  */
 463 #define CODING_ISO_FLAG_DIRECTION       0x0100
 464
 465 /* If set, assume designation states are reset at beginning of line on
 466    output.  */
 467 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 468
 469 /* If set, designation sequence should be placed at beginning of line
 470    on output.  */
 471 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 472
 473 /* If set, do not encode unsafe characters on output.  */
 474 #define CODING_ISO_FLAG_SAFE            0x0800
 475
 476 /* If set, extra latin codes (128..159) are accepted as a valid code
 477    on input.  */
 478 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 479
 480 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 481
 482 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 483
 484 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 485
 486 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 487
 488 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 489
 490 /* A character to be produced on output if encoding of the original
 491    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 492 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 493
 494 /* UTF-8 section */
 495 #define CODING_UTF_8_BOM(coding)        \
 496   ((coding)->spec.utf_8_bom)
 497
 498 /* UTF-16 section */
 499 #define CODING_UTF_16_BOM(coding)       \
 500   ((coding)->spec.utf_16.bom)
 501
 502 #define CODING_UTF_16_ENDIAN(coding)    \
 503   ((coding)->spec.utf_16.endian)
 504
 505 #define CODING_UTF_16_SURROGATE(coding) \
 506   ((coding)->spec.utf_16.surrogate)
 507
 508
 509 /* CCL section */
 510 #define CODING_CCL_DECODER(coding)      \
 511   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 512 #define CODING_CCL_ENCODER(coding)      \
 513   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 514 #define CODING_CCL_VALIDS(coding)                                          \
 515   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 516
 517 /* Index for each coding category in `coding_categories' */
 518
 519 enum coding_category
 520   {
 521     coding_category_iso_7,
 522     coding_category_iso_7_tight,
 523     coding_category_iso_8_1,
 524     coding_category_iso_8_2,
 525     coding_category_iso_7_else,
 526     coding_category_iso_8_else,
 527     coding_category_utf_8_auto,
 528     coding_category_utf_8_nosig,
 529     coding_category_utf_8_sig,
 530     coding_category_utf_16_auto,
 531     coding_category_utf_16_be,
 532     coding_category_utf_16_le,
 533     coding_category_utf_16_be_nosig,
 534     coding_category_utf_16_le_nosig,
 535     coding_category_charset,
 536     coding_category_sjis,
 537     coding_category_big5,
 538     coding_category_ccl,
 539     coding_category_emacs_mule,
 540     /* All above are targets of code detection.  */
 541     coding_category_raw_text,
 542     coding_category_undecided,
 543     coding_category_max
 544   };
 545
 546 /* Definitions of flag bits used in detect_coding_XXXX.  */
 547 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 548 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 549 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 550 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 551 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 552 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 553 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 554 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 555 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 556 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 557 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 558 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 559 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 560 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 561 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 562 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 563 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 564 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 565 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 566 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 567
 568 /* This value is returned if detect_coding_mask () find nothing other
 569    than ASCII characters.  */
 570 #define CATEGORY_MASK_ANY               \
 571   (CATEGORY_MASK_ISO_7                  \
 572    | CATEGORY_MASK_ISO_7_TIGHT          \
 573    | CATEGORY_MASK_ISO_8_1              \
 574    | CATEGORY_MASK_ISO_8_2              \
 575    | CATEGORY_MASK_ISO_7_ELSE           \
 576    | CATEGORY_MASK_ISO_8_ELSE           \
 577    | CATEGORY_MASK_UTF_8_AUTO           \
 578    | CATEGORY_MASK_UTF_8_NOSIG          \
 579    | CATEGORY_MASK_UTF_8_SIG            \
 580    | CATEGORY_MASK_UTF_16_AUTO          \
 581    | CATEGORY_MASK_UTF_16_BE            \
 582    | CATEGORY_MASK_UTF_16_LE            \
 583    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 584    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 585    | CATEGORY_MASK_CHARSET              \
 586    | CATEGORY_MASK_SJIS                 \
 587    | CATEGORY_MASK_BIG5                 \
 588    | CATEGORY_MASK_CCL                  \
 589    | CATEGORY_MASK_EMACS_MULE)
 590
 591
 592 #define CATEGORY_MASK_ISO_7BIT \
 593   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 594
 595 #define CATEGORY_MASK_ISO_8BIT \
 596   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 597
 598 #define CATEGORY_MASK_ISO_ELSE \
 599   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 600
 601 #define CATEGORY_MASK_ISO_ESCAPE        \
 602   (CATEGORY_MASK_ISO_7                  \
 603    | CATEGORY_MASK_ISO_7_TIGHT          \
 604    | CATEGORY_MASK_ISO_7_ELSE           \
 605    | CATEGORY_MASK_ISO_8_ELSE)
 606
 607 #define CATEGORY_MASK_ISO       \
 608   (  CATEGORY_MASK_ISO_7BIT     \
 609      | CATEGORY_MASK_ISO_8BIT   \
 610      | CATEGORY_MASK_ISO_ELSE)
 611
 612 #define CATEGORY_MASK_UTF_16            \
 613   (CATEGORY_MASK_UTF_16_AUTO            \
 614    | CATEGORY_MASK_UTF_16_BE            \
 615    | CATEGORY_MASK_UTF_16_LE            \
 616    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 617    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 618
 619 #define CATEGORY_MASK_UTF_8     \
 620   (CATEGORY_MASK_UTF_8_AUTO     \
 621    | CATEGORY_MASK_UTF_8_NOSIG  \
 622    | CATEGORY_MASK_UTF_8_SIG)
 623
 624 /* Table of coding categories (Lisp symbols).  This variable is for
 625    internal use only.  */
 626 static Lisp_Object Vcoding_category_table;
 627
 628 /* Table of coding-categories ordered by priority.  */
 629 static enum coding_category coding_priorities[coding_category_max];
 630
 631 /* Nth element is a coding context for the coding system bound to the
 632    Nth coding category.  */
 633 static struct coding_system coding_categories[coding_category_max];
 634
 635 /*** Commonly used macros and functions ***/
 636
 637 #ifndef min
 638 #define min(a, b) ((a) < (b) ? (a) : (b))
 639 #endif
 640 #ifndef max
 641 #define max(a, b) ((a) > (b) ? (a) : (b))
 642 #endif
 643
 644 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 645   do {                                                  \
 646     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 647     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 648   } while (0)
 649
 650
 651 /* Safely get one byte from the source text pointed by SRC which ends
 652    at SRC_END, and set C to that byte.  If there are not enough bytes
 653    in the source, it jumps to `no_more_source'.  If multibytep is
 654    nonzero, and a multibyte character is found at SRC, set C to the
 655    negative value of the character code.  The caller should declare
 656    and set these variables appropriately in advance:
 657         src, src_end, multibytep */
 658
 659 #define ONE_MORE_BYTE(c)                                \
 660   do {                                                  \
 661     if (src == src_end)                                 \
 662       {                                                 \
 663         if (src_base < src)                             \
 664           record_conversion_result                      \
 665             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 666         goto no_more_source;                            \
 667       }                                                 \
 668     c = *src++;                                         \
 669     if (multibytep && (c & 0x80))                       \
 670       {                                                 \
 671         if ((c & 0xFE) == 0xC0)                         \
 672           c = ((c & 1) << 6) | *src++;                  \
 673         else                                            \
 674           {                                             \
 675             src--;                                      \
 676             c = - string_char (src, &src, NULL);        \
 677             record_conversion_result                    \
 678               (coding, CODING_RESULT_INVALID_SRC);      \
 679           }                                             \
 680       }                                                 \
 681     consumed_chars++;                                   \
 682   } while (0)
 683
 684 /* Safely get two bytes from the source text pointed by SRC which ends
 685    at SRC_END, and set C1 and C2 to those bytes while skipping the
 686    heading multibyte characters.  If there are not enough bytes in the
 687    source, it jumps to `no_more_source'.  If multibytep is nonzero and
 688    a multibyte character is found for C2, set C2 to the negative value
 689    of the character code.  The caller should declare and set these
 690    variables appropriately in advance:
 691         src, src_end, multibytep
 692    It is intended that this macro is used in detect_coding_utf_16.  */
 693
 694 #define TWO_MORE_BYTES(c1, c2)                          \
 695   do {                                                  \
 696     do {                                                \
 697       if (src == src_end)                               \
 698         goto no_more_source;                            \
 699       c1 = *src++;                                      \
 700       if (multibytep && (c1 & 0x80))                    \
 701         {                                               \
 702           if ((c1 & 0xFE) == 0xC0)                      \
 703             c1 = ((c1 & 1) << 6) | *src++;              \
 704           else                                          \
 705             {                                           \
 706               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 707               c1 = -1;                                  \
 708             }                                           \
 709         }                                               \
 710     } while (c1 < 0);                                   \
 711     if (src == src_end)                                 \
 712       goto no_more_source;                              \
 713     c2 = *src++;                                        \
 714     if (multibytep && (c2 & 0x80))                      \
 715       {                                                 \
 716         if ((c2 & 0xFE) == 0xC0)                        \
 717           c2 = ((c2 & 1) << 6) | *src++;                \
 718         else                                            \
 719           c2 = -1;                                      \
 720       }                                                 \
 721   } while (0)
 722
 723
 724 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 725   do {                                                  \
 726     c = *src++;                                         \
 727     if (multibytep && (c & 0x80))                       \
 728       {                                                 \
 729         if ((c & 0xFE) == 0xC0)                         \
 730           c = ((c & 1) << 6) | *src++;                  \
 731         else                                            \
 732           {                                             \
 733             src--;                                      \
 734             c = - string_char (src, &src, NULL);        \
 735             record_conversion_result                    \
 736               (coding, CODING_RESULT_INVALID_SRC);      \
 737           }                                             \
 738       }                                                 \
 739     consumed_chars++;                                   \
 740   } while (0)
 741
 742
 743 /* Store a byte C in the place pointed by DST and increment DST to the
 744    next free point, and increment PRODUCED_CHARS.  The caller should
 745    assure that C is 0..127, and declare and set the variable `dst'
 746    appropriately in advance.
 747 */
 748
 749
 750 #define EMIT_ONE_ASCII_BYTE(c)  \
 751   do {                          \
 752     produced_chars++;           \
 753     *dst++ = (c);               \
 754   } while (0)
 755
 756
 757 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 758
 759 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 760   do {                                  \
 761     produced_chars += 2;                \
 762     *dst++ = (c1), *dst++ = (c2);       \
 763   } while (0)
 764
 765
 766 /* Store a byte C in the place pointed by DST and increment DST to the
 767    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 768    nonzero, store in an appropriate multibyte from.  The caller should
 769    declare and set the variables `dst' and `multibytep' appropriately
 770    in advance.  */
 771
 772 #define EMIT_ONE_BYTE(c)                \
 773   do {                                  \
 774     produced_chars++;                   \
 775     if (multibytep)                     \
 776       {                                 \
 777         int ch = (c);                   \
 778         if (ch >= 0x80)                 \
 779           ch = BYTE8_TO_CHAR (ch);      \
 780         CHAR_STRING_ADVANCE (ch, dst);  \
 781       }                                 \
 782     else                                \
 783       *dst++ = (c);                     \
 784   } while (0)
 785
 786
 787 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 788
 789 #define EMIT_TWO_BYTES(c1, c2)          \
 790   do {                                  \
 791     produced_chars += 2;                \
 792     if (multibytep)                     \
 793       {                                 \
 794         int ch;                         \
 795                                         \
 796         ch = (c1);                      \
 797         if (ch >= 0x80)                 \
 798           ch = BYTE8_TO_CHAR (ch);      \
 799         CHAR_STRING_ADVANCE (ch, dst);  \
 800         ch = (c2);                      \
 801         if (ch >= 0x80)                 \
 802           ch = BYTE8_TO_CHAR (ch);      \
 803         CHAR_STRING_ADVANCE (ch, dst);  \
 804       }                                 \
 805     else                                \
 806       {                                 \
 807         *dst++ = (c1);                  \
 808         *dst++ = (c2);                  \
 809       }                                 \
 810   } while (0)
 811
 812
 813 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 814   do {                                  \
 815     EMIT_ONE_BYTE (c1);                 \
 816     EMIT_TWO_BYTES (c2, c3);            \
 817   } while (0)
 818
 819
 820 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 821   do {                                          \
 822     EMIT_TWO_BYTES (c1, c2);                    \
 823     EMIT_TWO_BYTES (c3, c4);                    \
 824   } while (0)
 825
 826
 827 /* Prototypes for static functions.  */
 828 static void record_conversion_result (struct coding_system *coding,
 829                                       enum coding_result_code result);
 830 static int detect_coding_utf_8 (struct coding_system *,
 831                                 struct coding_detection_info *info);
 832 static void decode_coding_utf_8 (struct coding_system *);
 833 static int encode_coding_utf_8 (struct coding_system *);
 834
 835 static int detect_coding_utf_16 (struct coding_system *,
 836                                  struct coding_detection_info *info);
 837 static void decode_coding_utf_16 (struct coding_system *);
 838 static int encode_coding_utf_16 (struct coding_system *);
 839
 840 static int detect_coding_iso_2022 (struct coding_system *,
 841                                    struct coding_detection_info *info);
 842 static void decode_coding_iso_2022 (struct coding_system *);
 843 static int encode_coding_iso_2022 (struct coding_system *);
 844
 845 static int detect_coding_emacs_mule (struct coding_system *,
 846                                      struct coding_detection_info *info);
 847 static void decode_coding_emacs_mule (struct coding_system *);
 848 static int encode_coding_emacs_mule (struct coding_system *);
 849
 850 static int detect_coding_sjis (struct coding_system *,
 851                                struct coding_detection_info *info);
 852 static void decode_coding_sjis (struct coding_system *);
 853 static int encode_coding_sjis (struct coding_system *);
 854
 855 static int detect_coding_big5 (struct coding_system *,
 856                                struct coding_detection_info *info);
 857 static void decode_coding_big5 (struct coding_system *);
 858 static int encode_coding_big5 (struct coding_system *);
 859
 860 static int detect_coding_ccl (struct coding_system *,
 861                               struct coding_detection_info *info);
 862 static void decode_coding_ccl (struct coding_system *);
 863 static int encode_coding_ccl (struct coding_system *);
 864
 865 static void decode_coding_raw_text (struct coding_system *);
 866 static int encode_coding_raw_text (struct coding_system *);
 867
 868 static void coding_set_source (struct coding_system *);
 869 static void coding_set_destination (struct coding_system *);
 870 static void coding_alloc_by_realloc (struct coding_system *, EMACS_INT);
 871 static void coding_alloc_by_making_gap (struct coding_system *,
 872                                         EMACS_INT, EMACS_INT);
 873 static unsigned char *alloc_destination (struct coding_system *,
 874                                          EMACS_INT, unsigned char *);
 875 static void setup_iso_safe_charsets (Lisp_Object);
 876 static unsigned char *encode_designation_at_bol (struct coding_system *,
 877                                                  int *, int *,
 878                                                  unsigned char *);
 879 static int detect_eol (const unsigned char *,
 880                        EMACS_INT, enum coding_category);
 881 static Lisp_Object adjust_coding_eol_type (struct coding_system *, int);
 882 static void decode_eol (struct coding_system *);
 883 static Lisp_Object get_translation_table (Lisp_Object, int, int *);
 884 static Lisp_Object get_translation (Lisp_Object, int *, int *);
 885 static int produce_chars (struct coding_system *, Lisp_Object, int);
 886 static INLINE void produce_charset (struct coding_system *, int *,
 887                                     EMACS_INT);
 888 static void produce_annotation (struct coding_system *, EMACS_INT);
 889 static int decode_coding (struct coding_system *);
 890 static INLINE int *handle_composition_annotation (EMACS_INT, EMACS_INT,
 891                                                   struct coding_system *,
 892                                                   int *, EMACS_INT *);
 893 static INLINE int *handle_charset_annotation (EMACS_INT, EMACS_INT,
 894                                               struct coding_system *,
 895                                               int *, EMACS_INT *);
 896 static void consume_chars (struct coding_system *, Lisp_Object, int);
 897 static int encode_coding (struct coding_system *);
 898 static Lisp_Object make_conversion_work_buffer (int);
 899 static Lisp_Object code_conversion_restore (Lisp_Object);
 900 static INLINE int char_encodable_p (int, Lisp_Object);
 901 static Lisp_Object make_subsidiaries (Lisp_Object);
 902
 903 static void
 904 record_conversion_result (struct coding_system *coding,
 905                           enum coding_result_code result)
 906 {
 907   coding->result = result;
 908   switch (result)
 909     {
 910     case CODING_RESULT_INSUFFICIENT_SRC:
 911       Vlast_code_conversion_error = Qinsufficient_source;
 912       break;
 913     case CODING_RESULT_INCONSISTENT_EOL:
 914       Vlast_code_conversion_error = Qinconsistent_eol;
 915       break;
 916     case CODING_RESULT_INVALID_SRC:
 917       Vlast_code_conversion_error = Qinvalid_source;
 918       break;
 919     case CODING_RESULT_INTERRUPT:
 920       Vlast_code_conversion_error = Qinterrupted;
 921       break;
 922     case CODING_RESULT_INSUFFICIENT_MEM:
 923       Vlast_code_conversion_error = Qinsufficient_memory;
 924       break;
 925     case CODING_RESULT_INSUFFICIENT_DST:
 926       /* Don't record this error in Vlast_code_conversion_error
 927          because it happens just temporarily and is resolved when the
 928          whole conversion is finished.  */
 929       break;
 930     case CODING_RESULT_SUCCESS:
 931       break;
 932     default:
 933       Vlast_code_conversion_error = intern ("Unknown error");
 934     }
 935 }
 936
 937 /* This wrapper macro is used to preserve validity of pointers into
 938    buffer text across calls to decode_char, which could cause
 939    relocation of buffers if it loads a charset map, because loading a
 940    charset map allocates large structures.  */
 941 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 942   do {                                                                       \
 943     charset_map_loaded = 0;                                                  \
 944     c = DECODE_CHAR (charset, code);                                         \
 945     if (charset_map_loaded)                                                  \
 946       {                                                                      \
 947         const unsigned char *orig = coding->source;                          \
 948         EMACS_INT offset;                                                    \
 949                                                                              \
 950         coding_set_source (coding);                                          \
 951         offset = coding->source - orig;                                      \
 952         src += offset;                                                       \
 953         src_base += offset;                                                  \
 954         src_end += offset;                                                   \
 955       }                                                                      \
 956   } while (0)
 957
 958
 959 /* If there are at least BYTES length of room at dst, allocate memory
 960    for coding->destination and update dst and dst_end.  We don't have
 961    to take care of coding->source which will be relocated.  It is
 962    handled by calling coding_set_source in encode_coding.  */
 963
 964 #define ASSURE_DESTINATION(bytes)                               \
 965   do {                                                          \
 966     if (dst + (bytes) >= dst_end)                               \
 967       {                                                         \
 968         int more_bytes = charbuf_end - charbuf + (bytes);       \
 969                                                                 \
 970         dst = alloc_destination (coding, more_bytes, dst);      \
 971         dst_end = coding->destination + coding->dst_bytes;      \
 972       }                                                         \
 973   } while (0)
 974
 975
 976 /* Store multibyte form of the character C in P, and advance P to the
 977    end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
 978    never calls MAYBE_UNIFY_CHAR.  */
 979
 980 #define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)      \
 981   do {                                          \
 982     if ((c) <= MAX_1_BYTE_CHAR)                 \
 983       *(p)++ = (c);                             \
 984     else if ((c) <= MAX_2_BYTE_CHAR)            \
 985       *(p)++ = (0xC0 | ((c) >> 6)),             \
 986         *(p)++ = (0x80 | ((c) & 0x3F));         \
 987     else if ((c) <= MAX_3_BYTE_CHAR)            \
 988       *(p)++ = (0xE0 | ((c) >> 12)),            \
 989         *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),  \
 990         *(p)++ = (0x80 | ((c) & 0x3F));         \
 991     else if ((c) <= MAX_4_BYTE_CHAR)            \
 992       *(p)++ = (0xF0 | (c >> 18)),              \
 993         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
 994         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
 995         *(p)++ = (0x80 | (c & 0x3F));           \
 996     else if ((c) <= MAX_5_BYTE_CHAR)            \
 997       *(p)++ = 0xF8,                            \
 998         *(p)++ = (0x80 | ((c >> 18) & 0x0F)),   \
 999         *(p)++ = (0x80 | ((c >> 12) & 0x3F)),   \
1000         *(p)++ = (0x80 | ((c >> 6) & 0x3F)),    \
1001         *(p)++ = (0x80 | (c & 0x3F));           \
1002     else                                        \
1003       (p) += BYTE8_STRING ((c) - 0x3FFF80, p);  \
1004   } while (0)
1005
1006
1007 /* Return the character code of character whose multibyte form is at
1008    P, and advance P to the end of the multibyte form.  This is like
1009    STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.  */
1010
1011 #define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
1012   (!((p)[0] & 0x80)                                             \
1013    ? *(p)++                                                     \
1014    : ! ((p)[0] & 0x20)                                          \
1015    ? ((p) += 2,                                                 \
1016       ((((p)[-2] & 0x1F) << 6)                                  \
1017        | ((p)[-1] & 0x3F)                                       \
1018        | ((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)))    \
1019    : ! ((p)[0] & 0x10)                                          \
1020    ? ((p) += 3,                                                 \
1021       ((((p)[-3] & 0x0F) << 12)                                 \
1022        | (((p)[-2] & 0x3F) << 6)                                \
1023        | ((p)[-1] & 0x3F)))                                     \
1024    : ! ((p)[0] & 0x08)                                          \
1025    ? ((p) += 4,                                                 \
1026       ((((p)[-4] & 0xF) << 18)                                  \
1027        | (((p)[-3] & 0x3F) << 12)                               \
1028        | (((p)[-2] & 0x3F) << 6)                                \
1029        | ((p)[-1] & 0x3F)))                                     \
1030    : ((p) += 5,                                                 \
1031       ((((p)[-4] & 0x3F) << 18)                                 \
1032        | (((p)[-3] & 0x3F) << 12)                               \
1033        | (((p)[-2] & 0x3F) << 6)                                \
1034        | ((p)[-1] & 0x3F))))
1035
1036
1037 static void
1038 coding_set_source (struct coding_system *coding)
1039 {
1040   if (BUFFERP (coding->src_object))
1041     {
1042       struct buffer *buf = XBUFFER (coding->src_object);
1043
1044       if (coding->src_pos < 0)
1045         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1046       else
1047         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1048     }
1049   else if (STRINGP (coding->src_object))
1050     {
1051       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1052     }
1053   else
1054     {
1055       /* Otherwise, the source is C string and is never relocated
1056          automatically.  Thus we don't have to update anything.  */
1057     }
1058 }
1059
1060 static void
1061 coding_set_destination (struct coding_system *coding)
1062 {
1063   if (BUFFERP (coding->dst_object))
1064     {
1065       if (coding->src_pos < 0)
1066         {
1067           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1068           coding->dst_bytes = (GAP_END_ADDR
1069                                - (coding->src_bytes - coding->consumed)
1070                                - coding->destination);
1071         }
1072       else
1073         {
1074           /* We are sure that coding->dst_pos_byte is before the gap
1075              of the buffer. */
1076           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1077                                  + coding->dst_pos_byte - BEG_BYTE);
1078           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1079                                - coding->destination);
1080         }
1081     }
1082   else
1083     {
1084       /* Otherwise, the destination is C string and is never relocated
1085          automatically.  Thus we don't have to update anything.  */
1086     }
1087 }
1088
1089
1090 static void
1091 coding_alloc_by_realloc (struct coding_system *coding, EMACS_INT bytes)
1092 {
1093   coding->destination = (unsigned char *) xrealloc (coding->destination,
1094                                                     coding->dst_bytes + bytes);
1095   coding->dst_bytes += bytes;
1096 }
1097
1098 static void
1099 coding_alloc_by_making_gap (struct coding_system *coding,
1100                             EMACS_INT gap_head_used, EMACS_INT bytes)
1101 {
1102   if (EQ (coding->src_object, coding->dst_object))
1103     {
1104       /* The gap may contain the produced data at the head and not-yet
1105          consumed data at the tail.  To preserve those data, we at
1106          first make the gap size to zero, then increase the gap
1107          size.  */
1108       EMACS_INT add = GAP_SIZE;
1109
1110       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1111       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1112       make_gap (bytes);
1113       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1114       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1115     }
1116   else
1117     {
1118       Lisp_Object this_buffer;
1119
1120       this_buffer = Fcurrent_buffer ();
1121       set_buffer_internal (XBUFFER (coding->dst_object));
1122       make_gap (bytes);
1123       set_buffer_internal (XBUFFER (this_buffer));
1124     }
1125 }
1126
1127
1128 static unsigned char *
1129 alloc_destination (struct coding_system *coding, EMACS_INT nbytes,
1130                    unsigned char *dst)
1131 {
1132   EMACS_INT offset = dst - coding->destination;
1133
1134   if (BUFFERP (coding->dst_object))
1135     {
1136       struct buffer *buf = XBUFFER (coding->dst_object);
1137
1138       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1139     }
1140   else
1141     coding_alloc_by_realloc (coding, nbytes);
1142   coding_set_destination (coding);
1143   dst = coding->destination + offset;
1144   return dst;
1145 }
1146
1147 /** Macros for annotations.  */
1148
1149 /* An annotation data is stored in the array coding->charbuf in this
1150    format:
1151      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1152    LENGTH is the number of elements in the annotation.
1153    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1154    NCHARS is the number of characters in the text annotated.
1155
1156    The format of the following elements depend on ANNOTATION_MASK.
1157
1158    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1159    follows:
1160      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1161
1162    NBYTES is the number of bytes specified in the header part of
1163    old-style emacs-mule encoding, or 0 for the other kind of
1164    composition.
1165
1166    METHOD is one of enum composition_method.
1167
1168    Optional COMPOSITION-COMPONENTS are characters and composition
1169    rules.
1170
1171    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1172    follows.
1173
1174    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1175    recover from an invalid annotation, and should be skipped by
1176    produce_annotation.  */
1177
1178 /* Maximum length of the header of annotation data.  */
1179 #define MAX_ANNOTATION_LENGTH 5
1180
1181 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1182   do {                                                  \
1183     *(buf)++ = -(len);                                  \
1184     *(buf)++ = (mask);                                  \
1185     *(buf)++ = (nchars);                                \
1186     coding->annotated = 1;                              \
1187   } while (0);
1188
1189 #define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
1190   do {                                                                      \
1191     ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1192     *buf++ = nbytes;                                                        \
1193     *buf++ = method;                                                        \
1194   } while (0)
1195
1196
1197 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1198   do {                                                                  \
1199     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1200     *buf++ = id;                                                        \
1201   } while (0)
1202
1203 \f
1204 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1205
1206
1207
1208 \f
1209 /*** 3. UTF-8 ***/
1210
1211 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1212    Check if a text is encoded in UTF-8.  If it is, return 1, else
1213    return 0.  */
1214
1215 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1216 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1217 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1218 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1219 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1220 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1221
1222 #define UTF_BOM 0xFEFF
1223 #define UTF_8_BOM_1 0xEF
1224 #define UTF_8_BOM_2 0xBB
1225 #define UTF_8_BOM_3 0xBF
1226
1227 static int
1228 detect_coding_utf_8 (struct coding_system *coding,
1229                      struct coding_detection_info *detect_info)
1230 {
1231   const unsigned char *src = coding->source, *src_base;
1232   const unsigned char *src_end = coding->source + coding->src_bytes;
1233   int multibytep = coding->src_multibyte;
1234   int consumed_chars = 0;
1235   int bom_found = 0;
1236   int found = 0;
1237
1238   detect_info->checked |= CATEGORY_MASK_UTF_8;
1239   /* A coding system of this category is always ASCII compatible.  */
1240   src += coding->head_ascii;
1241
1242   while (1)
1243     {
1244       int c, c1, c2, c3, c4;
1245
1246       src_base = src;
1247       ONE_MORE_BYTE (c);
1248       if (c < 0 || UTF_8_1_OCTET_P (c))
1249         continue;
1250       ONE_MORE_BYTE (c1);
1251       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1252         break;
1253       if (UTF_8_2_OCTET_LEADING_P (c))
1254         {
1255           found = 1;
1256           continue;
1257         }
1258       ONE_MORE_BYTE (c2);
1259       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1260         break;
1261       if (UTF_8_3_OCTET_LEADING_P (c))
1262         {
1263           found = 1;
1264           if (src_base == coding->source
1265               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1266             bom_found = 1;
1267           continue;
1268         }
1269       ONE_MORE_BYTE (c3);
1270       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1271         break;
1272       if (UTF_8_4_OCTET_LEADING_P (c))
1273         {
1274           found = 1;
1275           continue;
1276         }
1277       ONE_MORE_BYTE (c4);
1278       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1279         break;
1280       if (UTF_8_5_OCTET_LEADING_P (c))
1281         {
1282           found = 1;
1283           continue;
1284         }
1285       break;
1286     }
1287   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1288   return 0;
1289
1290  no_more_source:
1291   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1292     {
1293       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1294       return 0;
1295     }
1296   if (bom_found)
1297     {
1298       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1299       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1300     }
1301   else
1302     {
1303       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1304       if (found)
1305         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1306     }
1307   return 1;
1308 }
1309
1310
1311 static void
1312 decode_coding_utf_8 (struct coding_system *coding)
1313 {
1314   const unsigned char *src = coding->source + coding->consumed;
1315   const unsigned char *src_end = coding->source + coding->src_bytes;
1316   const unsigned char *src_base;
1317   int *charbuf = coding->charbuf + coding->charbuf_used;
1318   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1319   int consumed_chars = 0, consumed_chars_base = 0;
1320   int multibytep = coding->src_multibyte;
1321   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1322   Lisp_Object attr, charset_list;
1323   int eol_crlf =
1324     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1325   int byte_after_cr = -1;
1326
1327   CODING_GET_INFO (coding, attr, charset_list);
1328
1329   if (bom != utf_without_bom)
1330     {
1331       int c1, c2, c3;
1332
1333       src_base = src;
1334       ONE_MORE_BYTE (c1);
1335       if (! UTF_8_3_OCTET_LEADING_P (c1))
1336         src = src_base;
1337       else
1338         {
1339           ONE_MORE_BYTE (c2);
1340           if (! UTF_8_EXTRA_OCTET_P (c2))
1341             src = src_base;
1342           else
1343             {
1344               ONE_MORE_BYTE (c3);
1345               if (! UTF_8_EXTRA_OCTET_P (c3))
1346                 src = src_base;
1347               else
1348                 {
1349                   if ((c1 != UTF_8_BOM_1)
1350                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1351                     src = src_base;
1352                   else
1353                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1354                 }
1355             }
1356         }
1357     }
1358   CODING_UTF_8_BOM (coding) = utf_without_bom;
1359
1360   while (1)
1361     {
1362       int c, c1, c2, c3, c4, c5;
1363
1364       src_base = src;
1365       consumed_chars_base = consumed_chars;
1366
1367       if (charbuf >= charbuf_end)
1368         {
1369           if (byte_after_cr >= 0)
1370             src_base--;
1371           break;
1372         }
1373
1374       if (byte_after_cr >= 0)
1375         c1 = byte_after_cr, byte_after_cr = -1;
1376       else
1377         ONE_MORE_BYTE (c1);
1378       if (c1 < 0)
1379         {
1380           c = - c1;
1381         }
1382       else if (UTF_8_1_OCTET_P (c1))
1383         {
1384           if (eol_crlf && c1 == '\r')
1385             ONE_MORE_BYTE (byte_after_cr);
1386           c = c1;
1387         }
1388       else
1389         {
1390           ONE_MORE_BYTE (c2);
1391           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1392             goto invalid_code;
1393           if (UTF_8_2_OCTET_LEADING_P (c1))
1394             {
1395               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1396               /* Reject overlong sequences here and below.  Encoders
1397                  producing them are incorrect, they can be misleading,
1398                  and they mess up read/write invariance.  */
1399               if (c < 128)
1400                 goto invalid_code;
1401             }
1402           else
1403             {
1404               ONE_MORE_BYTE (c3);
1405               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1406                 goto invalid_code;
1407               if (UTF_8_3_OCTET_LEADING_P (c1))
1408                 {
1409                   c = (((c1 & 0xF) << 12)
1410                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1411                   if (c < 0x800
1412                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1413                     goto invalid_code;
1414                 }
1415               else
1416                 {
1417                   ONE_MORE_BYTE (c4);
1418                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1419                     goto invalid_code;
1420                   if (UTF_8_4_OCTET_LEADING_P (c1))
1421                     {
1422                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1423                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1424                     if (c < 0x10000)
1425                       goto invalid_code;
1426                     }
1427                   else
1428                     {
1429                       ONE_MORE_BYTE (c5);
1430                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1431                         goto invalid_code;
1432                       if (UTF_8_5_OCTET_LEADING_P (c1))
1433                         {
1434                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1435                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1436                                | (c5 & 0x3F));
1437                           if ((c > MAX_CHAR) || (c < 0x200000))
1438                             goto invalid_code;
1439                         }
1440                       else
1441                         goto invalid_code;
1442                     }
1443                 }
1444             }
1445         }
1446
1447       *charbuf++ = c;
1448       continue;
1449
1450     invalid_code:
1451       src = src_base;
1452       consumed_chars = consumed_chars_base;
1453       ONE_MORE_BYTE (c);
1454       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1455       coding->errors++;
1456     }
1457
1458  no_more_source:
1459   coding->consumed_char += consumed_chars_base;
1460   coding->consumed = src_base - coding->source;
1461   coding->charbuf_used = charbuf - coding->charbuf;
1462 }
1463
1464
1465 static int
1466 encode_coding_utf_8 (struct coding_system *coding)
1467 {
1468   int multibytep = coding->dst_multibyte;
1469   int *charbuf = coding->charbuf;
1470   int *charbuf_end = charbuf + coding->charbuf_used;
1471   unsigned char *dst = coding->destination + coding->produced;
1472   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1473   int produced_chars = 0;
1474   int c;
1475
1476   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1477     {
1478       ASSURE_DESTINATION (3);
1479       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1480       CODING_UTF_8_BOM (coding) = utf_without_bom;
1481     }
1482
1483   if (multibytep)
1484     {
1485       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1486
1487       while (charbuf < charbuf_end)
1488         {
1489           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1490
1491           ASSURE_DESTINATION (safe_room);
1492           c = *charbuf++;
1493           if (CHAR_BYTE8_P (c))
1494             {
1495               c = CHAR_TO_BYTE8 (c);
1496               EMIT_ONE_BYTE (c);
1497             }
1498           else
1499             {
1500               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1501               for (p = str; p < pend; p++)
1502                 EMIT_ONE_BYTE (*p);
1503             }
1504         }
1505     }
1506   else
1507     {
1508       int safe_room = MAX_MULTIBYTE_LENGTH;
1509
1510       while (charbuf < charbuf_end)
1511         {
1512           ASSURE_DESTINATION (safe_room);
1513           c = *charbuf++;
1514           if (CHAR_BYTE8_P (c))
1515             *dst++ = CHAR_TO_BYTE8 (c);
1516           else
1517             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1518           produced_chars++;
1519         }
1520     }
1521   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1522   coding->produced_char += produced_chars;
1523   coding->produced = dst - coding->destination;
1524   return 0;
1525 }
1526
1527
1528 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1529    Check if a text is encoded in one of UTF-16 based coding systems.
1530    If it is, return 1, else return 0.  */
1531
1532 #define UTF_16_HIGH_SURROGATE_P(val) \
1533   (((val) & 0xFC00) == 0xD800)
1534
1535 #define UTF_16_LOW_SURROGATE_P(val) \
1536   (((val) & 0xFC00) == 0xDC00)
1537
1538 #define UTF_16_INVALID_P(val)   \
1539   (((val) == 0xFFFE)            \
1540    || ((val) == 0xFFFF)         \
1541    || UTF_16_LOW_SURROGATE_P (val))
1542
1543
1544 static int
1545 detect_coding_utf_16 (struct coding_system *coding,
1546                       struct coding_detection_info *detect_info)
1547 {
1548   const unsigned char *src = coding->source;
1549   const unsigned char *src_end = coding->source + coding->src_bytes;
1550   int multibytep = coding->src_multibyte;
1551   int c1, c2;
1552
1553   detect_info->checked |= CATEGORY_MASK_UTF_16;
1554   if (coding->mode & CODING_MODE_LAST_BLOCK
1555       && (coding->src_chars & 1))
1556     {
1557       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1558       return 0;
1559     }
1560
1561   TWO_MORE_BYTES (c1, c2);
1562   if ((c1 == 0xFF) && (c2 == 0xFE))
1563     {
1564       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1565                              | CATEGORY_MASK_UTF_16_AUTO);
1566       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1567                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1568                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1569     }
1570   else if ((c1 == 0xFE) && (c2 == 0xFF))
1571     {
1572       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1573                              | CATEGORY_MASK_UTF_16_AUTO);
1574       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1575                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1576                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1577     }
1578   else if (c2 < 0)
1579     {
1580       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1581       return 0;
1582     }
1583   else
1584     {
1585       /* We check the dispersion of Eth and Oth bytes where E is even and
1586          O is odd.  If both are high, we assume binary data.*/
1587       unsigned char e[256], o[256];
1588       unsigned e_num = 1, o_num = 1;
1589
1590       memset (e, 0, 256);
1591       memset (o, 0, 256);
1592       e[c1] = 1;
1593       o[c2] = 1;
1594
1595       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1596                                 |CATEGORY_MASK_UTF_16_BE
1597                                 | CATEGORY_MASK_UTF_16_LE);
1598
1599       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1600              != CATEGORY_MASK_UTF_16)
1601         {
1602           TWO_MORE_BYTES (c1, c2);
1603           if (c2 < 0)
1604             break;
1605           if (! e[c1])
1606             {
1607               e[c1] = 1;
1608               e_num++;
1609               if (e_num >= 128)
1610                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1611             }
1612           if (! o[c2])
1613             {
1614               o[c2] = 1;
1615               o_num++;
1616               if (o_num >= 128)
1617                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1618             }
1619         }
1620       return 0;
1621     }
1622
1623  no_more_source:
1624   return 1;
1625 }
1626
1627 static void
1628 decode_coding_utf_16 (struct coding_system *coding)
1629 {
1630   const unsigned char *src = coding->source + coding->consumed;
1631   const unsigned char *src_end = coding->source + coding->src_bytes;
1632   const unsigned char *src_base;
1633   int *charbuf = coding->charbuf + coding->charbuf_used;
1634   /* We may produces at most 3 chars in one loop.  */
1635   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1636   int consumed_chars = 0, consumed_chars_base = 0;
1637   int multibytep = coding->src_multibyte;
1638   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1639   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1640   int surrogate = CODING_UTF_16_SURROGATE (coding);
1641   Lisp_Object attr, charset_list;
1642   int eol_crlf =
1643     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1644   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1645
1646   CODING_GET_INFO (coding, attr, charset_list);
1647
1648   if (bom == utf_with_bom)
1649     {
1650       int c, c1, c2;
1651
1652       src_base = src;
1653       ONE_MORE_BYTE (c1);
1654       ONE_MORE_BYTE (c2);
1655       c = (c1 << 8) | c2;
1656
1657       if (endian == utf_16_big_endian
1658           ? c != 0xFEFF : c != 0xFFFE)
1659         {
1660           /* The first two bytes are not BOM.  Treat them as bytes
1661              for a normal character.  */
1662           src = src_base;
1663           coding->errors++;
1664         }
1665       CODING_UTF_16_BOM (coding) = utf_without_bom;
1666     }
1667   else if (bom == utf_detect_bom)
1668     {
1669       /* We have already tried to detect BOM and failed in
1670          detect_coding.  */
1671       CODING_UTF_16_BOM (coding) = utf_without_bom;
1672     }
1673
1674   while (1)
1675     {
1676       int c, c1, c2;
1677
1678       src_base = src;
1679       consumed_chars_base = consumed_chars;
1680
1681       if (charbuf >= charbuf_end)
1682         {
1683           if (byte_after_cr1 >= 0)
1684             src_base -= 2;
1685           break;
1686         }
1687
1688       if (byte_after_cr1 >= 0)
1689         c1 = byte_after_cr1, byte_after_cr1 = -1;
1690       else
1691         ONE_MORE_BYTE (c1);
1692       if (c1 < 0)
1693         {
1694           *charbuf++ = -c1;
1695           continue;
1696         }
1697       if (byte_after_cr2 >= 0)
1698         c2 = byte_after_cr2, byte_after_cr2 = -1;
1699       else
1700         ONE_MORE_BYTE (c2);
1701       if (c2 < 0)
1702         {
1703           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1704           *charbuf++ = -c2;
1705           continue;
1706         }
1707       c = (endian == utf_16_big_endian
1708            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1709
1710       if (surrogate)
1711         {
1712           if (! UTF_16_LOW_SURROGATE_P (c))
1713             {
1714               if (endian == utf_16_big_endian)
1715                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1716               else
1717                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1718               *charbuf++ = c1;
1719               *charbuf++ = c2;
1720               coding->errors++;
1721               if (UTF_16_HIGH_SURROGATE_P (c))
1722                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1723               else
1724                 *charbuf++ = c;
1725             }
1726           else
1727             {
1728               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1729               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1730               *charbuf++ = 0x10000 + c;
1731             }
1732         }
1733       else
1734         {
1735           if (UTF_16_HIGH_SURROGATE_P (c))
1736             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1737           else
1738             {
1739               if (eol_crlf && c == '\r')
1740                 {
1741                   ONE_MORE_BYTE (byte_after_cr1);
1742                   ONE_MORE_BYTE (byte_after_cr2);
1743                 }
1744               *charbuf++ = c;
1745             }
1746         }
1747     }
1748
1749  no_more_source:
1750   coding->consumed_char += consumed_chars_base;
1751   coding->consumed = src_base - coding->source;
1752   coding->charbuf_used = charbuf - coding->charbuf;
1753 }
1754
1755 static int
1756 encode_coding_utf_16 (struct coding_system *coding)
1757 {
1758   int multibytep = coding->dst_multibyte;
1759   int *charbuf = coding->charbuf;
1760   int *charbuf_end = charbuf + coding->charbuf_used;
1761   unsigned char *dst = coding->destination + coding->produced;
1762   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1763   int safe_room = 8;
1764   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1765   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1766   int produced_chars = 0;
1767   Lisp_Object attrs, charset_list;
1768   int c;
1769
1770   CODING_GET_INFO (coding, attrs, charset_list);
1771
1772   if (bom != utf_without_bom)
1773     {
1774       ASSURE_DESTINATION (safe_room);
1775       if (big_endian)
1776         EMIT_TWO_BYTES (0xFE, 0xFF);
1777       else
1778         EMIT_TWO_BYTES (0xFF, 0xFE);
1779       CODING_UTF_16_BOM (coding) = utf_without_bom;
1780     }
1781
1782   while (charbuf < charbuf_end)
1783     {
1784       ASSURE_DESTINATION (safe_room);
1785       c = *charbuf++;
1786       if (c > MAX_UNICODE_CHAR)
1787         c = coding->default_char;
1788
1789       if (c < 0x10000)
1790         {
1791           if (big_endian)
1792             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1793           else
1794             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1795         }
1796       else
1797         {
1798           int c1, c2;
1799
1800           c -= 0x10000;
1801           c1 = (c >> 10) + 0xD800;
1802           c2 = (c & 0x3FF) + 0xDC00;
1803           if (big_endian)
1804             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1805           else
1806             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1807         }
1808     }
1809   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1810   coding->produced = dst - coding->destination;
1811   coding->produced_char += produced_chars;
1812   return 0;
1813 }
1814
1815 \f
1816 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1817
1818 /* Emacs' internal format for representation of multiple character
1819    sets is a kind of multi-byte encoding, i.e. characters are
1820    represented by variable-length sequences of one-byte codes.
1821
1822    ASCII characters and control characters (e.g. `tab', `newline') are
1823    represented by one-byte sequences which are their ASCII codes, in
1824    the range 0x00 through 0x7F.
1825
1826    8-bit characters of the range 0x80..0x9F are represented by
1827    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1828    code + 0x20).
1829
1830    8-bit characters of the range 0xA0..0xFF are represented by
1831    one-byte sequences which are their 8-bit code.
1832
1833    The other characters are represented by a sequence of `base
1834    leading-code', optional `extended leading-code', and one or two
1835    `position-code's.  The length of the sequence is determined by the
1836    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1837    whereas extended leading-code and position-code take the range 0xA0
1838    through 0xFF.  See `charset.h' for more details about leading-code
1839    and position-code.
1840
1841    --- CODE RANGE of Emacs' internal format ---
1842    character set        range
1843    -------------        -----
1844    ascii                0x00..0x7F
1845    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1846    eight-bit-graphic    0xA0..0xBF
1847    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1848    ---------------------------------------------
1849
1850    As this is the internal character representation, the format is
1851    usually not used externally (i.e. in a file or in a data sent to a
1852    process).  But, it is possible to have a text externally in this
1853    format (i.e. by encoding by the coding system `emacs-mule').
1854
1855    In that case, a sequence of one-byte codes has a slightly different
1856    form.
1857
1858    At first, all characters in eight-bit-control are represented by
1859    one-byte sequences which are their 8-bit code.
1860
1861    Next, character composition data are represented by the byte
1862    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1863    where,
1864         METHOD is 0xF2 plus one of composition method (enum
1865         composition_method),
1866
1867         BYTES is 0xA0 plus a byte length of this composition data,
1868
1869         CHARS is 0xA0 plus a number of characters composed by this
1870         data,
1871
1872         COMPONENTs are characters of multibyte form or composition
1873         rules encoded by two-byte of ASCII codes.
1874
1875    In addition, for backward compatibility, the following formats are
1876    also recognized as composition data on decoding.
1877
1878    0x80 MSEQ ...
1879    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1880
1881    Here,
1882         MSEQ is a multibyte form but in these special format:
1883           ASCII: 0xA0 ASCII_CODE+0x80,
1884           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1885         RULE is a one byte code of the range 0xA0..0xF0 that
1886         represents a composition rule.
1887   */
1888
1889 char emacs_mule_bytes[256];
1890
1891
1892 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1893    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1894    else return 0.  */
1895
1896 static int
1897 detect_coding_emacs_mule (struct coding_system *coding,
1898                           struct coding_detection_info *detect_info)
1899 {
1900   const unsigned char *src = coding->source, *src_base;
1901   const unsigned char *src_end = coding->source + coding->src_bytes;
1902   int multibytep = coding->src_multibyte;
1903   int consumed_chars = 0;
1904   int c;
1905   int found = 0;
1906
1907   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1908   /* A coding system of this category is always ASCII compatible.  */
1909   src += coding->head_ascii;
1910
1911   while (1)
1912     {
1913       src_base = src;
1914       ONE_MORE_BYTE (c);
1915       if (c < 0)
1916         continue;
1917       if (c == 0x80)
1918         {
1919           /* Perhaps the start of composite character.  We simply skip
1920              it because analyzing it is too heavy for detecting.  But,
1921              at least, we check that the composite character
1922              constitutes of more than 4 bytes.  */
1923           const unsigned char *src_base;
1924
1925         repeat:
1926           src_base = src;
1927           do
1928             {
1929               ONE_MORE_BYTE (c);
1930             }
1931           while (c >= 0xA0);
1932
1933           if (src - src_base <= 4)
1934             break;
1935           found = CATEGORY_MASK_EMACS_MULE;
1936           if (c == 0x80)
1937             goto repeat;
1938         }
1939
1940       if (c < 0x80)
1941         {
1942           if (c < 0x20
1943               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1944             break;
1945         }
1946       else
1947         {
1948           int more_bytes = emacs_mule_bytes[c] - 1;
1949
1950           while (more_bytes > 0)
1951             {
1952               ONE_MORE_BYTE (c);
1953               if (c < 0xA0)
1954                 {
1955                   src--;        /* Unread the last byte.  */
1956                   break;
1957                 }
1958               more_bytes--;
1959             }
1960           if (more_bytes != 0)
1961             break;
1962           found = CATEGORY_MASK_EMACS_MULE;
1963         }
1964     }
1965   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1966   return 0;
1967
1968  no_more_source:
1969   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1970     {
1971       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1972       return 0;
1973     }
1974   detect_info->found |= found;
1975   return 1;
1976 }
1977
1978
1979 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
1980    character.  If CMP_STATUS indicates that we must expect MSEQ or
1981    RULE described above, decode it and return the negative value of
1982    the decoded character or rule.  If an invalid byte is found, return
1983    -1.  If SRC is too short, return -2.  */
1984
1985 int
1986 emacs_mule_char (struct coding_system *coding, const unsigned char *src,
1987                  int *nbytes, int *nchars, int *id,
1988                  struct composition_status *cmp_status)
1989 {
1990   const unsigned char *src_end = coding->source + coding->src_bytes;
1991   const unsigned char *src_base = src;
1992   int multibytep = coding->src_multibyte;
1993   int charset_id;
1994   unsigned code;
1995   int c;
1996   int consumed_chars = 0;
1997   int mseq_found = 0;
1998
1999   ONE_MORE_BYTE (c);
2000   if (c < 0)
2001     {
2002       c = -c;
2003       charset_id = emacs_mule_charset[0];
2004     }
2005   else
2006     {
2007       if (c >= 0xA0)
2008         {
2009           if (cmp_status->state != COMPOSING_NO
2010               && cmp_status->old_form)
2011             {
2012               if (cmp_status->state == COMPOSING_CHAR)
2013                 {
2014                   if (c == 0xA0)
2015                     {
2016                       ONE_MORE_BYTE (c);
2017                       c -= 0x80;
2018                       if (c < 0)
2019                         goto invalid_code;
2020                     }
2021                   else
2022                     c -= 0x20;
2023                   mseq_found = 1;
2024                 }
2025               else
2026                 {
2027                   *nbytes = src - src_base;
2028                   *nchars = consumed_chars;
2029                   return -c;
2030                 }
2031             }
2032           else
2033             goto invalid_code;
2034         }
2035
2036       switch (emacs_mule_bytes[c])
2037         {
2038         case 2:
2039           if ((charset_id = emacs_mule_charset[c]) < 0)
2040             goto invalid_code;
2041           ONE_MORE_BYTE (c);
2042           if (c < 0xA0)
2043             goto invalid_code;
2044           code = c & 0x7F;
2045           break;
2046
2047         case 3:
2048           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2049               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2050             {
2051               ONE_MORE_BYTE (c);
2052               if (c < 0xA0 || (charset_id = emacs_mule_charset[c]) < 0)
2053                 goto invalid_code;
2054               ONE_MORE_BYTE (c);
2055               if (c < 0xA0)
2056                 goto invalid_code;
2057               code = c & 0x7F;
2058             }
2059           else
2060             {
2061               if ((charset_id = emacs_mule_charset[c]) < 0)
2062                 goto invalid_code;
2063               ONE_MORE_BYTE (c);
2064               if (c < 0xA0)
2065                 goto invalid_code;
2066               code = (c & 0x7F) << 8;
2067               ONE_MORE_BYTE (c);
2068               if (c < 0xA0)
2069                 goto invalid_code;
2070               code |= c & 0x7F;
2071             }
2072           break;
2073
2074         case 4:
2075           ONE_MORE_BYTE (c);
2076           if (c < 0 || (charset_id = emacs_mule_charset[c]) < 0)
2077             goto invalid_code;
2078           ONE_MORE_BYTE (c);
2079           if (c < 0xA0)
2080             goto invalid_code;
2081           code = (c & 0x7F) << 8;
2082           ONE_MORE_BYTE (c);
2083           if (c < 0xA0)
2084             goto invalid_code;
2085           code |= c & 0x7F;
2086           break;
2087
2088         case 1:
2089           code = c;
2090           charset_id = ASCII_BYTE_P (code) ? charset_ascii : charset_eight_bit;
2091           break;
2092
2093         default:
2094           abort ();
2095         }
2096       CODING_DECODE_CHAR (coding, src, src_base, src_end,
2097                           CHARSET_FROM_ID (charset_id), code, c);
2098       if (c < 0)
2099         goto invalid_code;
2100     }
2101   *nbytes = src - src_base;
2102   *nchars = consumed_chars;
2103   if (id)
2104     *id = charset_id;
2105   return (mseq_found ? -c : c);
2106
2107  no_more_source:
2108   return -2;
2109
2110  invalid_code:
2111   return -1;
2112 }
2113
2114
2115 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2116
2117 /* Handle these composition sequence ('|': the end of header elements,
2118    BYTES and CHARS >= 0xA0):
2119
2120    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2121    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2122    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2123
2124    and these old form:
2125
2126    (4) relative composition: 0x80 | MSEQ ... MSEQ
2127    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2128
2129    When the starter 0x80 and the following header elements are found,
2130    this annotation header is produced.
2131
2132         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2133
2134    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2135    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2136
2137    Then, upon reading the following elements, these codes are produced
2138    until the composition end is found:
2139
2140    (1) CHAR ... CHAR
2141    (2) ALT ... ALT CHAR ... CHAR
2142    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2143    (4) CHAR ... CHAR
2144    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2145
2146    When the composition end is found, LENGTH and NCHARS in the
2147    annotation header is updated as below:
2148
2149    (1) LENGTH: unchanged, NCHARS: unchanged
2150    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2151    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2152    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2153    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2154
2155    If an error is found while composing, the annotation header is
2156    changed to the original composition header (plus filler -1s) as
2157    below:
2158
2159    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2160    (5)          [ 0x80 0xFF -1 -1- -1 ]
2161
2162    and the sequence [ -2 DECODED-RULE ] is changed to the original
2163    byte sequence as below:
2164         o the original byte sequence is B: [ B -1 ]
2165         o the original byte sequence is B1 B2: [ B1 B2 ]
2166
2167    Most of the routines are implemented by macros because many
2168    variables and labels in the caller decode_coding_emacs_mule must be
2169    accessible, and they are usually called just once (thus doesn't
2170    increase the size of compiled object).  */
2171
2172 /* Decode a composition rule represented by C as a component of
2173    composition sequence of Emacs 20 style.  Set RULE to the decoded
2174    rule. */
2175
2176 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2177   do {                                                  \
2178     int gref, nref;                                     \
2179                                                         \
2180     c -= 0xA0;                                          \
2181     if (c < 0 || c >= 81)                               \
2182       goto invalid_code;                                \
2183     gref = c / 9, nref = c % 9;                         \
2184     if (gref == 4) gref = 10;                           \
2185     if (nref == 4) nref = 10;                           \
2186     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2187   } while (0)
2188
2189
2190 /* Decode a composition rule represented by C and the following byte
2191    at SRC as a component of composition sequence of Emacs 21 style.
2192    Set RULE to the decoded rule.  */
2193
2194 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2195   do {                                                  \
2196     int gref, nref;                                     \
2197                                                         \
2198     gref = c - 0x20;                                    \
2199     if (gref < 0 || gref >= 81)                         \
2200       goto invalid_code;                                \
2201     ONE_MORE_BYTE (c);                                  \
2202     nref = c - 0x20;                                    \
2203     if (nref < 0 || nref >= 81)                         \
2204       goto invalid_code;                                \
2205     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2206   } while (0)
2207
2208
2209 /* Start of Emacs 21 style format.  The first three bytes at SRC are
2210    (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is the
2211    byte length of this composition information, CHARS is the number of
2212    characters composed by this composition.  */
2213
2214 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2215   do {                                                                  \
2216     enum composition_method method = c - 0xF2;                          \
2217     int nbytes, nchars;                                                 \
2218                                                                         \
2219     ONE_MORE_BYTE (c);                                                  \
2220     if (c < 0)                                                          \
2221       goto invalid_code;                                                \
2222     nbytes = c - 0xA0;                                                  \
2223     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2224       goto invalid_code;                                                \
2225     ONE_MORE_BYTE (c);                                                  \
2226     nchars = c - 0xA0;                                                  \
2227     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2228       goto invalid_code;                                                \
2229     cmp_status->old_form = 0;                                           \
2230     cmp_status->method = method;                                        \
2231     if (method == COMPOSITION_RELATIVE)                                 \
2232       cmp_status->state = COMPOSING_CHAR;                               \
2233     else                                                                \
2234       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2235     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2236     cmp_status->nchars = nchars;                                        \
2237     cmp_status->ncomps = nbytes - 4;                                    \
2238     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2239   } while (0)
2240
2241
2242 /* Start of Emacs 20 style format for relative composition.  */
2243
2244 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2245   do {                                                          \
2246     cmp_status->old_form = 1;                                   \
2247     cmp_status->method = COMPOSITION_RELATIVE;                  \
2248     cmp_status->state = COMPOSING_CHAR;                         \
2249     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2250     cmp_status->nchars = cmp_status->ncomps = 0;                \
2251     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2252   } while (0)
2253
2254
2255 /* Start of Emacs 20 style format for rule-base composition.  */
2256
2257 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2258   do {                                                          \
2259     cmp_status->old_form = 1;                                   \
2260     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2261     cmp_status->state = COMPOSING_CHAR;                         \
2262     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2263     cmp_status->nchars = cmp_status->ncomps = 0;                \
2264     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2265   } while (0)
2266
2267
2268 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2269   do {                                                  \
2270     const unsigned char *current_src = src;             \
2271                                                         \
2272     ONE_MORE_BYTE (c);                                  \
2273     if (c < 0)                                          \
2274       goto invalid_code;                                \
2275     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2276         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2277       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2278     else if (c < 0xA0)                                  \
2279       goto invalid_code;                                \
2280     else if (c < 0xC0)                                  \
2281       {                                                 \
2282         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2283         /* Re-read C as a composition component.  */    \
2284         src = current_src;                              \
2285       }                                                 \
2286     else if (c == 0xFF)                                 \
2287       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2288     else                                                \
2289       goto invalid_code;                                \
2290   } while (0)
2291
2292 #define EMACS_MULE_COMPOSITION_END()                            \
2293   do {                                                          \
2294     int idx = - cmp_status->length;                             \
2295                                                                 \
2296     if (cmp_status->old_form)                                   \
2297       charbuf[idx + 2] = cmp_status->nchars;                    \
2298     else if (cmp_status->method > COMPOSITION_RELATIVE)         \
2299       charbuf[idx] = charbuf[idx + 2] - cmp_status->length;     \
2300     cmp_status->state = COMPOSING_NO;                           \
2301   } while (0)
2302
2303
2304 static int
2305 emacs_mule_finish_composition (int *charbuf,
2306                                struct composition_status *cmp_status)
2307 {
2308   int idx = - cmp_status->length;
2309   int new_chars;
2310
2311   if (cmp_status->old_form && cmp_status->nchars > 0)
2312     {
2313       charbuf[idx + 2] = cmp_status->nchars;
2314       new_chars = 0;
2315       if (cmp_status->method == COMPOSITION_WITH_RULE
2316           && cmp_status->state == COMPOSING_CHAR)
2317         {
2318           /* The last rule was invalid.  */
2319           int rule = charbuf[-1] + 0xA0;
2320
2321           charbuf[-2] = BYTE8_TO_CHAR (rule);
2322           charbuf[-1] = -1;
2323           new_chars = 1;
2324         }
2325     }
2326   else
2327     {
2328       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2329
2330       if (cmp_status->method == COMPOSITION_WITH_RULE)
2331         {
2332           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2333           charbuf[idx++] = -3;
2334           charbuf[idx++] = 0;
2335           new_chars = 1;
2336         }
2337       else
2338         {
2339           int nchars = charbuf[idx + 1] + 0xA0;
2340           int nbytes = charbuf[idx + 2] + 0xA0;
2341
2342           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2343           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2344           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2345           charbuf[idx++] = -1;
2346           new_chars = 4;
2347         }
2348     }
2349   cmp_status->state = COMPOSING_NO;
2350   return new_chars;
2351 }
2352
2353 #define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
2354   do {                                                                    \
2355     if (cmp_status->state != COMPOSING_NO)                                \
2356       char_offset += emacs_mule_finish_composition (charbuf, cmp_status); \
2357   } while (0)
2358
2359
2360 static void
2361 decode_coding_emacs_mule (struct coding_system *coding)
2362 {
2363   const unsigned char *src = coding->source + coding->consumed;
2364   const unsigned char *src_end = coding->source + coding->src_bytes;
2365   const unsigned char *src_base;
2366   int *charbuf = coding->charbuf + coding->charbuf_used;
2367   /* We may produce two annotations (charset and composition) in one
2368      loop and one more charset annotation at the end.  */
2369   int *charbuf_end
2370     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2371   int consumed_chars = 0, consumed_chars_base;
2372   int multibytep = coding->src_multibyte;
2373   Lisp_Object attrs, charset_list;
2374   int char_offset = coding->produced_char;
2375   int last_offset = char_offset;
2376   int last_id = charset_ascii;
2377   int eol_crlf =
2378     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2379   int byte_after_cr = -1;
2380   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2381
2382   CODING_GET_INFO (coding, attrs, charset_list);
2383
2384   if (cmp_status->state != COMPOSING_NO)
2385     {
2386       int i;
2387
2388       for (i = 0; i < cmp_status->length; i++)
2389         *charbuf++ = cmp_status->carryover[i];
2390       coding->annotated = 1;
2391     }
2392
2393   while (1)
2394     {
2395       int c, id;
2396
2397       src_base = src;
2398       consumed_chars_base = consumed_chars;
2399
2400       if (charbuf >= charbuf_end)
2401         {
2402           if (byte_after_cr >= 0)
2403             src_base--;
2404           break;
2405         }
2406
2407       if (byte_after_cr >= 0)
2408         c = byte_after_cr, byte_after_cr = -1;
2409       else
2410         ONE_MORE_BYTE (c);
2411
2412       if (c < 0 || c == 0x80)
2413         {
2414           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2415           if (c < 0)
2416             {
2417               *charbuf++ = -c;
2418               char_offset++;
2419             }
2420           else
2421             DECODE_EMACS_MULE_COMPOSITION_START ();
2422           continue;
2423         }
2424
2425       if (c < 0x80)
2426         {
2427           if (eol_crlf && c == '\r')
2428             ONE_MORE_BYTE (byte_after_cr);
2429           id = charset_ascii;
2430           if (cmp_status->state != COMPOSING_NO)
2431             {
2432               if (cmp_status->old_form)
2433                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2434               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2435                 cmp_status->ncomps--;
2436             }
2437         }
2438       else
2439         {
2440           int nchars, nbytes;
2441           /* emacs_mule_char can load a charset map from a file, which
2442              allocates a large structure and might cause buffer text
2443              to be relocated as result.  Thus, we need to remember the
2444              original pointer to buffer text, and fix up all related
2445              pointers after the call.  */
2446           const unsigned char *orig = coding->source;
2447           EMACS_INT offset;
2448
2449           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2450                                cmp_status);
2451           offset = coding->source - orig;
2452           if (offset)
2453             {
2454               src += offset;
2455               src_base += offset;
2456               src_end += offset;
2457             }
2458           if (c < 0)
2459             {
2460               if (c == -1)
2461                 goto invalid_code;
2462               if (c == -2)
2463                 break;
2464             }
2465           src = src_base + nbytes;
2466           consumed_chars = consumed_chars_base + nchars;
2467           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2468             cmp_status->ncomps -= nchars;
2469         }
2470
2471       /* Now if C >= 0, we found a normally encoded character, if C <
2472          0, we found an old-style composition component character or
2473          rule.  */
2474
2475       if (cmp_status->state == COMPOSING_NO)
2476         {
2477           if (last_id != id)
2478             {
2479               if (last_id != charset_ascii)
2480                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2481                                   last_id);
2482               last_id = id;
2483               last_offset = char_offset;
2484             }
2485           *charbuf++ = c;
2486           char_offset++;
2487         }
2488       else if (cmp_status->state == COMPOSING_CHAR)
2489         {
2490           if (cmp_status->old_form)
2491             {
2492               if (c >= 0)
2493                 {
2494                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2495                   *charbuf++ = c;
2496                   char_offset++;
2497                 }
2498               else
2499                 {
2500                   *charbuf++ = -c;
2501                   cmp_status->nchars++;
2502                   cmp_status->length++;
2503                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2504                     EMACS_MULE_COMPOSITION_END ();
2505                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2506                     cmp_status->state = COMPOSING_RULE;
2507                 }
2508             }
2509           else
2510             {
2511               *charbuf++ = c;
2512               cmp_status->length++;
2513               cmp_status->nchars--;
2514               if (cmp_status->nchars == 0)
2515                 EMACS_MULE_COMPOSITION_END ();
2516             }
2517         }
2518       else if (cmp_status->state == COMPOSING_RULE)
2519         {
2520           int rule;
2521
2522           if (c >= 0)
2523             {
2524               EMACS_MULE_COMPOSITION_END ();
2525               *charbuf++ = c;
2526               char_offset++;
2527             }
2528           else
2529             {
2530               c = -c;
2531               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2532               if (rule < 0)
2533                 goto invalid_code;
2534               *charbuf++ = -2;
2535               *charbuf++ = rule;
2536               cmp_status->length += 2;
2537               cmp_status->state = COMPOSING_CHAR;
2538             }
2539         }
2540       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2541         {
2542           *charbuf++ = c;
2543           cmp_status->length++;
2544           if (cmp_status->ncomps == 0)
2545             cmp_status->state = COMPOSING_CHAR;
2546           else if (cmp_status->ncomps > 0)
2547             {
2548               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2549                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2550             }
2551           else
2552             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2553         }
2554       else                      /* COMPOSING_COMPONENT_RULE */
2555         {
2556           int rule;
2557
2558           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2559           if (rule < 0)
2560             goto invalid_code;
2561           *charbuf++ = -2;
2562           *charbuf++ = rule;
2563           cmp_status->length += 2;
2564           cmp_status->ncomps--;
2565           if (cmp_status->ncomps > 0)
2566             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2567           else
2568             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2569         }
2570       continue;
2571
2572     invalid_code:
2573       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2574       src = src_base;
2575       consumed_chars = consumed_chars_base;
2576       ONE_MORE_BYTE (c);
2577       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2578       char_offset++;
2579       coding->errors++;
2580     }
2581
2582  no_more_source:
2583   if (cmp_status->state != COMPOSING_NO)
2584     {
2585       if (coding->mode & CODING_MODE_LAST_BLOCK)
2586         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2587       else
2588         {
2589           int i;
2590
2591           charbuf -= cmp_status->length;
2592           for (i = 0; i < cmp_status->length; i++)
2593             cmp_status->carryover[i] = charbuf[i];
2594         }
2595     }
2596   if (last_id != charset_ascii)
2597     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2598   coding->consumed_char += consumed_chars_base;
2599   coding->consumed = src_base - coding->source;
2600   coding->charbuf_used = charbuf - coding->charbuf;
2601 }
2602
2603
2604 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2605   do {                                          \
2606     if (id < 0xA0)                              \
2607       codes[0] = id, codes[1] = 0;              \
2608     else if (id < 0xE0)                         \
2609       codes[0] = 0x9A, codes[1] = id;           \
2610     else if (id < 0xF0)                         \
2611       codes[0] = 0x9B, codes[1] = id;           \
2612     else if (id < 0xF5)                         \
2613       codes[0] = 0x9C, codes[1] = id;           \
2614     else                                        \
2615       codes[0] = 0x9D, codes[1] = id;           \
2616   } while (0);
2617
2618
2619 static int
2620 encode_coding_emacs_mule (struct coding_system *coding)
2621 {
2622   int multibytep = coding->dst_multibyte;
2623   int *charbuf = coding->charbuf;
2624   int *charbuf_end = charbuf + coding->charbuf_used;
2625   unsigned char *dst = coding->destination + coding->produced;
2626   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2627   int safe_room = 8;
2628   int produced_chars = 0;
2629   Lisp_Object attrs, charset_list;
2630   int c;
2631   int preferred_charset_id = -1;
2632
2633   CODING_GET_INFO (coding, attrs, charset_list);
2634   if (! EQ (charset_list, Vemacs_mule_charset_list))
2635     {
2636       CODING_ATTR_CHARSET_LIST (attrs)
2637         = charset_list = Vemacs_mule_charset_list;
2638     }
2639
2640   while (charbuf < charbuf_end)
2641     {
2642       ASSURE_DESTINATION (safe_room);
2643       c = *charbuf++;
2644
2645       if (c < 0)
2646         {
2647           /* Handle an annotation.  */
2648           switch (*charbuf)
2649             {
2650             case CODING_ANNOTATE_COMPOSITION_MASK:
2651               /* Not yet implemented.  */
2652               break;
2653             case CODING_ANNOTATE_CHARSET_MASK:
2654               preferred_charset_id = charbuf[3];
2655               if (preferred_charset_id >= 0
2656                   && NILP (Fmemq (make_number (preferred_charset_id),
2657                                   charset_list)))
2658                 preferred_charset_id = -1;
2659               break;
2660             default:
2661               abort ();
2662             }
2663           charbuf += -c - 1;
2664           continue;
2665         }
2666
2667       if (ASCII_CHAR_P (c))
2668         EMIT_ONE_ASCII_BYTE (c);
2669       else if (CHAR_BYTE8_P (c))
2670         {
2671           c = CHAR_TO_BYTE8 (c);
2672           EMIT_ONE_BYTE (c);
2673         }
2674       else
2675         {
2676           struct charset *charset;
2677           unsigned code;
2678           int dimension;
2679           int emacs_mule_id;
2680           unsigned char leading_codes[2];
2681
2682           if (preferred_charset_id >= 0)
2683             {
2684               charset = CHARSET_FROM_ID (preferred_charset_id);
2685               if (CHAR_CHARSET_P (c, charset))
2686                 code = ENCODE_CHAR (charset, c);
2687               else
2688                 charset = char_charset (c, charset_list, &code);
2689             }
2690           else
2691             charset = char_charset (c, charset_list, &code);
2692           if (! charset)
2693             {
2694               c = coding->default_char;
2695               if (ASCII_CHAR_P (c))
2696                 {
2697                   EMIT_ONE_ASCII_BYTE (c);
2698                   continue;
2699                 }
2700               charset = char_charset (c, charset_list, &code);
2701             }
2702           dimension = CHARSET_DIMENSION (charset);
2703           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2704           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2705           EMIT_ONE_BYTE (leading_codes[0]);
2706           if (leading_codes[1])
2707             EMIT_ONE_BYTE (leading_codes[1]);
2708           if (dimension == 1)
2709             EMIT_ONE_BYTE (code | 0x80);
2710           else
2711             {
2712               code |= 0x8080;
2713               EMIT_ONE_BYTE (code >> 8);
2714               EMIT_ONE_BYTE (code & 0xFF);
2715             }
2716         }
2717     }
2718   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2719   coding->produced_char += produced_chars;
2720   coding->produced = dst - coding->destination;
2721   return 0;
2722 }
2723
2724 \f
2725 /*** 7. ISO2022 handlers ***/
2726
2727 /* The following note describes the coding system ISO2022 briefly.
2728    Since the intention of this note is to help understand the
2729    functions in this file, some parts are NOT ACCURATE or are OVERLY
2730    SIMPLIFIED.  For thorough understanding, please refer to the
2731    original document of ISO2022.  This is equivalent to the standard
2732    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2733
2734    ISO2022 provides many mechanisms to encode several character sets
2735    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2736    is encoded using bytes less than 128.  This may make the encoded
2737    text a little bit longer, but the text passes more easily through
2738    several types of gateway, some of which strip off the MSB (Most
2739    Significant Bit).
2740
2741    There are two kinds of character sets: control character sets and
2742    graphic character sets.  The former contain control characters such
2743    as `newline' and `escape' to provide control functions (control
2744    functions are also provided by escape sequences).  The latter
2745    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2746    two control character sets and many graphic character sets.
2747
2748    Graphic character sets are classified into one of the following
2749    four classes, according to the number of bytes (DIMENSION) and
2750    number of characters in one dimension (CHARS) of the set:
2751    - DIMENSION1_CHARS94
2752    - DIMENSION1_CHARS96
2753    - DIMENSION2_CHARS94
2754    - DIMENSION2_CHARS96
2755
2756    In addition, each character set is assigned an identification tag,
2757    unique for each set, called the "final character" (denoted as <F>
2758    hereafter).  The <F> of each character set is decided by ECMA(*)
2759    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2760    (0x30..0x3F are for private use only).
2761
2762    Note (*): ECMA = European Computer Manufacturers Association
2763
2764    Here are examples of graphic character sets [NAME(<F>)]:
2765         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2766         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2767         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2768         o DIMENSION2_CHARS96 -- none for the moment
2769
2770    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2771         C0 [0x00..0x1F] -- control character plane 0
2772         GL [0x20..0x7F] -- graphic character plane 0
2773         C1 [0x80..0x9F] -- control character plane 1
2774         GR [0xA0..0xFF] -- graphic character plane 1
2775
2776    A control character set is directly designated and invoked to C0 or
2777    C1 by an escape sequence.  The most common case is that:
2778    - ISO646's  control character set is designated/invoked to C0, and
2779    - ISO6429's control character set is designated/invoked to C1,
2780    and usually these designations/invocations are omitted in encoded
2781    text.  In a 7-bit environment, only C0 can be used, and a control
2782    character for C1 is encoded by an appropriate escape sequence to
2783    fit into the environment.  All control characters for C1 are
2784    defined to have corresponding escape sequences.
2785
2786    A graphic character set is at first designated to one of four
2787    graphic registers (G0 through G3), then these graphic registers are
2788    invoked to GL or GR.  These designations and invocations can be
2789    done independently.  The most common case is that G0 is invoked to
2790    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2791    these invocations and designations are omitted in encoded text.
2792    In a 7-bit environment, only GL can be used.
2793
2794    When a graphic character set of CHARS94 is invoked to GL, codes
2795    0x20 and 0x7F of the GL area work as control characters SPACE and
2796    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2797    be used.
2798
2799    There are two ways of invocation: locking-shift and single-shift.
2800    With locking-shift, the invocation lasts until the next different
2801    invocation, whereas with single-shift, the invocation affects the
2802    following character only and doesn't affect the locking-shift
2803    state.  Invocations are done by the following control characters or
2804    escape sequences:
2805
2806    ----------------------------------------------------------------------
2807    abbrev  function                  cntrl escape seq   description
2808    ----------------------------------------------------------------------
2809    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2810    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2811    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2812    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2813    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2814    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2815    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2816    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2817    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2818    ----------------------------------------------------------------------
2819    (*) These are not used by any known coding system.
2820
2821    Control characters for these functions are defined by macros
2822    ISO_CODE_XXX in `coding.h'.
2823
2824    Designations are done by the following escape sequences:
2825    ----------------------------------------------------------------------
2826    escape sequence      description
2827    ----------------------------------------------------------------------
2828    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2829    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2830    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2831    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2832    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2833    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2834    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2835    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2836    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2837    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2838    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2839    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2840    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2841    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2842    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2843    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2844    ----------------------------------------------------------------------
2845
2846    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2847    of dimension 1, chars 94, and final character <F>, etc...
2848
2849    Note (*): Although these designations are not allowed in ISO2022,
2850    Emacs accepts them on decoding, and produces them on encoding
2851    CHARS96 character sets in a coding system which is characterized as
2852    7-bit environment, non-locking-shift, and non-single-shift.
2853
2854    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2855    '(' must be omitted.  We refer to this as "short-form" hereafter.
2856
2857    Now you may notice that there are a lot of ways of encoding the
2858    same multilingual text in ISO2022.  Actually, there exist many
2859    coding systems such as Compound Text (used in X11's inter client
2860    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2861    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2862    localized platforms), and all of these are variants of ISO2022.
2863
2864    In addition to the above, Emacs handles two more kinds of escape
2865    sequences: ISO6429's direction specification and Emacs' private
2866    sequence for specifying character composition.
2867
2868    ISO6429's direction specification takes the following form:
2869         o CSI ']'      -- end of the current direction
2870         o CSI '0' ']'  -- end of the current direction
2871         o CSI '1' ']'  -- start of left-to-right text
2872         o CSI '2' ']'  -- start of right-to-left text
2873    The control character CSI (0x9B: control sequence introducer) is
2874    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2875
2876    Character composition specification takes the following form:
2877         o ESC '0' -- start relative composition
2878         o ESC '1' -- end composition
2879         o ESC '2' -- start rule-base composition (*)
2880         o ESC '3' -- start relative composition with alternate chars  (**)
2881         o ESC '4' -- start rule-base composition with alternate chars  (**)
2882   Since these are not standard escape sequences of any ISO standard,
2883   the use of them with these meanings is restricted to Emacs only.
2884
2885   (*) This form is used only in Emacs 20.7 and older versions,
2886   but newer versions can safely decode it.
2887   (**) This form is used only in Emacs 21.1 and newer versions,
2888   and older versions can't decode it.
2889
2890   Here's a list of example usages of these composition escape
2891   sequences (categorized by `enum composition_method').
2892
2893   COMPOSITION_RELATIVE:
2894         ESC 0 CHAR [ CHAR ] ESC 1
2895   COMPOSITION_WITH_RULE:
2896         ESC 2 CHAR [ RULE CHAR ] ESC 1
2897   COMPOSITION_WITH_ALTCHARS:
2898         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2899   COMPOSITION_WITH_RULE_ALTCHARS:
2900         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2901
2902 enum iso_code_class_type iso_code_class[256];
2903
2904 #define SAFE_CHARSET_P(coding, id)      \
2905   ((id) <= (coding)->max_charset_id     \
2906    && (coding)->safe_charsets[id] != 255)
2907
2908
2909 #define SHIFT_OUT_OK(category)  \
2910   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2911
2912 static void
2913 setup_iso_safe_charsets (Lisp_Object attrs)
2914 {
2915   Lisp_Object charset_list, safe_charsets;
2916   Lisp_Object request;
2917   Lisp_Object reg_usage;
2918   Lisp_Object tail;
2919   int reg94, reg96;
2920   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2921   int max_charset_id;
2922
2923   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2924   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2925       && ! EQ (charset_list, Viso_2022_charset_list))
2926     {
2927       CODING_ATTR_CHARSET_LIST (attrs)
2928         = charset_list = Viso_2022_charset_list;
2929       ASET (attrs, coding_attr_safe_charsets, Qnil);
2930     }
2931
2932   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2933     return;
2934
2935   max_charset_id = 0;
2936   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2937     {
2938       int id = XINT (XCAR (tail));
2939       if (max_charset_id < id)
2940         max_charset_id = id;
2941     }
2942
2943   safe_charsets = make_uninit_string (max_charset_id + 1);
2944   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
2945   request = AREF (attrs, coding_attr_iso_request);
2946   reg_usage = AREF (attrs, coding_attr_iso_usage);
2947   reg94 = XINT (XCAR (reg_usage));
2948   reg96 = XINT (XCDR (reg_usage));
2949
2950   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2951     {
2952       Lisp_Object id;
2953       Lisp_Object reg;
2954       struct charset *charset;
2955
2956       id = XCAR (tail);
2957       charset = CHARSET_FROM_ID (XINT (id));
2958       reg = Fcdr (Fassq (id, request));
2959       if (! NILP (reg))
2960         SSET (safe_charsets, XINT (id), XINT (reg));
2961       else if (charset->iso_chars_96)
2962         {
2963           if (reg96 < 4)
2964             SSET (safe_charsets, XINT (id), reg96);
2965         }
2966       else
2967         {
2968           if (reg94 < 4)
2969             SSET (safe_charsets, XINT (id), reg94);
2970         }
2971     }
2972   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2973 }
2974
2975
2976 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2977    Check if a text is encoded in one of ISO-2022 based coding systems.
2978    If it is, return 1, else return 0.  */
2979
2980 static int
2981 detect_coding_iso_2022 (struct coding_system *coding,
2982                         struct coding_detection_info *detect_info)
2983 {
2984   const unsigned char *src = coding->source, *src_base = src;
2985   const unsigned char *src_end = coding->source + coding->src_bytes;
2986   int multibytep = coding->src_multibyte;
2987   int single_shifting = 0;
2988   int id;
2989   int c, c1;
2990   int consumed_chars = 0;
2991   int i;
2992   int rejected = 0;
2993   int found = 0;
2994   int composition_count = -1;
2995
2996   detect_info->checked |= CATEGORY_MASK_ISO;
2997
2998   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2999     {
3000       struct coding_system *this = &(coding_categories[i]);
3001       Lisp_Object attrs, val;
3002
3003       if (this->id < 0)
3004         continue;
3005       attrs = CODING_ID_ATTRS (this->id);
3006       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3007           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3008         setup_iso_safe_charsets (attrs);
3009       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3010       this->max_charset_id = SCHARS (val) - 1;
3011       this->safe_charsets = SDATA (val);
3012     }
3013
3014   /* A coding system of this category is always ASCII compatible.  */
3015   src += coding->head_ascii;
3016
3017   while (rejected != CATEGORY_MASK_ISO)
3018     {
3019       src_base = src;
3020       ONE_MORE_BYTE (c);
3021       switch (c)
3022         {
3023         case ISO_CODE_ESC:
3024           if (inhibit_iso_escape_detection)
3025             break;
3026           single_shifting = 0;
3027           ONE_MORE_BYTE (c);
3028           if (c >= '(' && c <= '/')
3029             {
3030               /* Designation sequence for a charset of dimension 1.  */
3031               ONE_MORE_BYTE (c1);
3032               if (c1 < ' ' || c1 >= 0x80
3033                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3034                 /* Invalid designation sequence.  Just ignore.  */
3035                 break;
3036             }
3037           else if (c == '$')
3038             {
3039               /* Designation sequence for a charset of dimension 2.  */
3040               ONE_MORE_BYTE (c);
3041               if (c >= '@' && c <= 'B')
3042                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3043                 id = iso_charset_table[1][0][c];
3044               else if (c >= '(' && c <= '/')
3045                 {
3046                   ONE_MORE_BYTE (c1);
3047                   if (c1 < ' ' || c1 >= 0x80
3048                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3049                     /* Invalid designation sequence.  Just ignore.  */
3050                     break;
3051                 }
3052               else
3053                 /* Invalid designation sequence.  Just ignore it.  */
3054                 break;
3055             }
3056           else if (c == 'N' || c == 'O')
3057             {
3058               /* ESC <Fe> for SS2 or SS3.  */
3059               single_shifting = 1;
3060               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3061               break;
3062             }
3063           else if (c == '1')
3064             {
3065               /* End of composition.  */
3066               if (composition_count < 0
3067                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3068                 /* Invalid */
3069                 break;
3070               composition_count = -1;
3071               found |= CATEGORY_MASK_ISO;
3072             }
3073           else if (c >= '0' && c <= '4')
3074             {
3075               /* ESC <Fp> for start/end composition.  */
3076               composition_count = 0;
3077               break;
3078             }
3079           else
3080             {
3081               /* Invalid escape sequence.  Just ignore it.  */
3082               break;
3083             }
3084
3085           /* We found a valid designation sequence for CHARSET.  */
3086           rejected |= CATEGORY_MASK_ISO_8BIT;
3087           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3088                               id))
3089             found |= CATEGORY_MASK_ISO_7;
3090           else
3091             rejected |= CATEGORY_MASK_ISO_7;
3092           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3093                               id))
3094             found |= CATEGORY_MASK_ISO_7_TIGHT;
3095           else
3096             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3097           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3098                               id))
3099             found |= CATEGORY_MASK_ISO_7_ELSE;
3100           else
3101             rejected |= CATEGORY_MASK_ISO_7_ELSE;
3102           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3103                               id))
3104             found |= CATEGORY_MASK_ISO_8_ELSE;
3105           else
3106             rejected |= CATEGORY_MASK_ISO_8_ELSE;
3107           break;
3108
3109         case ISO_CODE_SO:
3110         case ISO_CODE_SI:
3111           /* Locking shift out/in.  */
3112           if (inhibit_iso_escape_detection)
3113             break;
3114           single_shifting = 0;
3115           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3116           break;
3117
3118         case ISO_CODE_CSI:
3119           /* Control sequence introducer.  */
3120           single_shifting = 0;
3121           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3122           found |= CATEGORY_MASK_ISO_8_ELSE;
3123           goto check_extra_latin;
3124
3125         case ISO_CODE_SS2:
3126         case ISO_CODE_SS3:
3127           /* Single shift.   */
3128           if (inhibit_iso_escape_detection)
3129             break;
3130           single_shifting = 0;
3131           rejected |= CATEGORY_MASK_ISO_7BIT;
3132           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3133               & CODING_ISO_FLAG_SINGLE_SHIFT)
3134             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3135           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3136               & CODING_ISO_FLAG_SINGLE_SHIFT)
3137             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3138           if (single_shifting)
3139             break;
3140           goto check_extra_latin;
3141
3142         default:
3143           if (c < 0)
3144             continue;
3145           if (c < 0x80)
3146             {
3147               if (composition_count >= 0)
3148                 composition_count++;
3149               single_shifting = 0;
3150               break;
3151             }
3152           if (c >= 0xA0)
3153             {
3154               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3155               found |= CATEGORY_MASK_ISO_8_1;
3156               /* Check the length of succeeding codes of the range
3157                  0xA0..0FF.  If the byte length is even, we include
3158                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3159                  only when we are not single shifting.  */
3160               if (! single_shifting
3161                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3162                 {
3163                   int i = 1;
3164                   while (src < src_end)
3165                     {
3166                       src_base = src;
3167                       ONE_MORE_BYTE (c);
3168                       if (c < 0xA0)
3169                         {
3170                           src = src_base;
3171                           break;
3172                         }
3173                       i++;
3174                     }
3175
3176                   if (i & 1 && src < src_end)
3177                     {
3178                       rejected |= CATEGORY_MASK_ISO_8_2;
3179                       if (composition_count >= 0)
3180                         composition_count += i;
3181                     }
3182                   else
3183                     {
3184                       found |= CATEGORY_MASK_ISO_8_2;
3185                       if (composition_count >= 0)
3186                         composition_count += i / 2;
3187                     }
3188                 }
3189               break;
3190             }
3191         check_extra_latin:
3192           single_shifting = 0;
3193           if (! VECTORP (Vlatin_extra_code_table)
3194               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3195             {
3196               rejected = CATEGORY_MASK_ISO;
3197               break;
3198             }
3199           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3200               & CODING_ISO_FLAG_LATIN_EXTRA)
3201             found |= CATEGORY_MASK_ISO_8_1;
3202           else
3203             rejected |= CATEGORY_MASK_ISO_8_1;
3204           rejected |= CATEGORY_MASK_ISO_8_2;
3205         }
3206     }
3207   detect_info->rejected |= CATEGORY_MASK_ISO;
3208   return 0;
3209
3210  no_more_source:
3211   detect_info->rejected |= rejected;
3212   detect_info->found |= (found & ~rejected);
3213   return 1;
3214 }
3215
3216
3217 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
3218    escape sequence should be kept.  */
3219 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
3220   do {                                                                  \
3221     int id, prev;                                                       \
3222                                                                         \
3223     if (final < '0' || final >= 128                                     \
3224         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
3225         || !SAFE_CHARSET_P (coding, id))                                \
3226       {                                                                 \
3227         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
3228         chars_96 = -1;                                                  \
3229         break;                                                          \
3230       }                                                                 \
3231     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
3232     if (id == charset_jisx0201_roman)                                   \
3233       {                                                                 \
3234         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
3235           id = charset_ascii;                                           \
3236       }                                                                 \
3237     else if (id == charset_jisx0208_1978)                               \
3238       {                                                                 \
3239         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
3240           id = charset_jisx0208;                                        \
3241       }                                                                 \
3242     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
3243     /* If there was an invalid designation to REG previously, and this  \
3244        designation is ASCII to REG, we should keep this designation     \
3245        sequence.  */                                                    \
3246     if (prev == -2 && id == charset_ascii)                              \
3247       chars_96 = -1;                                                    \
3248   } while (0)
3249
3250
3251 /* Handle these composition sequence (ALT: alternate char):
3252
3253    (1) relative composition: ESC 0 CHAR ... ESC 1
3254    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3255    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3256    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3257
3258    When the start sequence (ESC 0/2/3/4) is found, this annotation
3259    header is produced.
3260
3261         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3262
3263    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3264    produced until the end sequence (ESC 1) is found:
3265
3266    (1) CHAR ... CHAR
3267    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3268    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3269    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3270
3271    When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
3272    annotation header is updated as below:
3273
3274    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3275    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3276    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3277    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3278
3279    If an error is found while composing, the annotation header is
3280    changed to:
3281
3282         [ ESC '0'/'2'/'3'/'4' -2 0 ]
3283
3284    and the sequence [ -2 DECODED-RULE ] is changed to the original
3285    byte sequence as below:
3286         o the original byte sequence is B: [ B -1 ]
3287         o the original byte sequence is B1 B2: [ B1 B2 ]
3288    and the sequence [ -1 -1 ] is changed to the original byte
3289    sequence:
3290         [ ESC '0' ]
3291 */
3292
3293 /* Decode a composition rule C1 and maybe one more byte from the
3294    source, and set RULE to the encoded composition rule, NBYTES to the
3295    length of the composition rule.  If the rule is invalid, set RULE
3296    to some negative value.  */
3297
3298 #define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
3299   do {                                                                  \
3300     rule = c1 - 32;                                                     \
3301     if (rule < 0)                                                       \
3302       break;                                                            \
3303     if (rule < 81)              /* old format (before ver.21) */        \
3304       {                                                                 \
3305         int gref = (rule) / 9;                                          \
3306         int nref = (rule) % 9;                                          \
3307         if (gref == 4) gref = 10;                                       \
3308         if (nref == 4) nref = 10;                                       \
3309         rule = COMPOSITION_ENCODE_RULE (gref, nref);                    \
3310         nbytes = 1;                                                     \
3311       }                                                                 \
3312     else                        /* new format (after ver.21) */         \
3313       {                                                                 \
3314         int c;                                                          \
3315                                                                         \
3316         ONE_MORE_BYTE (c);                                              \
3317         rule = COMPOSITION_ENCODE_RULE (rule - 81, c - 32);             \
3318         if (rule >= 0)                                                  \
3319           rule += 0x100;   /* to destinguish it from the old format */  \
3320         nbytes = 2;                                                     \
3321       }                                                                 \
3322   } while (0)
3323
3324 #define ENCODE_COMPOSITION_RULE(rule)                           \
3325   do {                                                          \
3326     int gref = (rule % 0x100) / 12, nref = (rule % 0x100) % 12; \
3327                                                                 \
3328     if (rule < 0x100)           /* old format */                \
3329       {                                                         \
3330         if (gref == 10) gref = 4;                               \
3331         if (nref == 10) nref = 4;                               \
3332         charbuf[idx] = 32 + gref * 9 + nref;                    \
3333         charbuf[idx + 1] = -1;                                  \
3334         new_chars++;                                            \
3335       }                                                         \
3336     else                                /* new format */        \
3337       {                                                         \
3338         charbuf[idx] = 32 + 81 + gref;                          \
3339         charbuf[idx + 1] = 32 + nref;                           \
3340         new_chars += 2;                                         \
3341       }                                                         \
3342   } while (0)
3343
3344 /* Finish the current composition as invalid.  */
3345
3346 static int finish_composition (int *, struct composition_status *);
3347
3348 static int
3349 finish_composition (int *charbuf, struct composition_status *cmp_status)
3350 {
3351   int idx = - cmp_status->length;
3352   int new_chars;
3353
3354   /* Recover the original ESC sequence */
3355   charbuf[idx++] = ISO_CODE_ESC;
3356   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3357                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3358                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3359                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3360                     : '4');
3361   charbuf[idx++] = -2;
3362   charbuf[idx++] = 0;
3363   charbuf[idx++] = -1;
3364   new_chars = cmp_status->nchars;
3365   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3366     for (; idx < 0; idx++)
3367       {
3368         int elt = charbuf[idx];
3369
3370         if (elt == -2)
3371           {
3372             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3373             idx++;
3374           }
3375         else if (elt == -1)
3376           {
3377             charbuf[idx++] = ISO_CODE_ESC;
3378             charbuf[idx] = '0';
3379             new_chars += 2;
3380           }
3381       }
3382   cmp_status->state = COMPOSING_NO;
3383   return new_chars;
3384 }
3385
3386 /* If characters are under composition, finish the composition.  */
3387 #define MAYBE_FINISH_COMPOSITION()                              \
3388   do {                                                          \
3389     if (cmp_status->state != COMPOSING_NO)                      \
3390       char_offset += finish_composition (charbuf, cmp_status);  \
3391   } while (0)
3392
3393 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
3394
3395    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
3396    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3397    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
3398    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
3399
3400    Produce this annotation sequence now:
3401
3402    [ -LENGTH(==-4) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) METHOD ]
3403 */
3404
3405 #define DECODE_COMPOSITION_START(c1)                                       \
3406   do {                                                                     \
3407     if (c1 == '0'                                                          \
3408         && ((cmp_status->state == COMPOSING_COMPONENT_CHAR                 \
3409              && cmp_status->method == COMPOSITION_WITH_ALTCHARS)           \
3410             || (cmp_status->state == COMPOSING_COMPONENT_RULE              \
3411                 && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS))) \
3412       {                                                                    \
3413         *charbuf++ = -1;                                                   \
3414         *charbuf++= -1;                                                    \
3415         cmp_status->state = COMPOSING_CHAR;                                \
3416         cmp_status->length += 2;                                           \
3417       }                                                                    \
3418     else                                                                   \
3419       {                                                                    \
3420         MAYBE_FINISH_COMPOSITION ();                                       \
3421         cmp_status->method = (c1 == '0' ? COMPOSITION_RELATIVE             \
3422                               : c1 == '2' ? COMPOSITION_WITH_RULE          \
3423                               : c1 == '3' ? COMPOSITION_WITH_ALTCHARS      \
3424                               : COMPOSITION_WITH_RULE_ALTCHARS);           \
3425         cmp_status->state                                                  \
3426           = (c1 <= '2' ? COMPOSING_CHAR : COMPOSING_COMPONENT_CHAR);       \
3427         ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
3428         cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
3429         cmp_status->nchars = cmp_status->ncomps = 0;                       \
3430         coding->annotated = 1;                                             \
3431       }                                                                    \
3432   } while (0)
3433
3434
3435 /* Handle composition end sequence ESC 1.  */
3436
3437 #define DECODE_COMPOSITION_END()                                        \
3438   do {                                                                  \
3439     if (cmp_status->nchars == 0                                         \
3440         || ((cmp_status->state == COMPOSING_CHAR)                       \
3441             == (cmp_status->method == COMPOSITION_WITH_RULE)))          \
3442       {                                                                 \
3443         MAYBE_FINISH_COMPOSITION ();                                    \
3444         goto invalid_code;                                              \
3445       }                                                                 \
3446     if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
3447       charbuf[- cmp_status->length] -= cmp_status->ncomps + 2;          \
3448     else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
3449       charbuf[- cmp_status->length] -= cmp_status->ncomps * 3;          \
3450     charbuf[- cmp_status->length + 2] = cmp_status->nchars;             \
3451     char_offset += cmp_status->nchars;                                  \
3452     cmp_status->state = COMPOSING_NO;                                   \
3453   } while (0)
3454
3455 /* Store a composition rule RULE in charbuf, and update cmp_status.  */
3456
3457 #define STORE_COMPOSITION_RULE(rule)    \
3458   do {                                  \
3459     *charbuf++ = -2;                    \
3460     *charbuf++ = rule;                  \
3461     cmp_status->length += 2;            \
3462     cmp_status->state--;                \
3463   } while (0)
3464
3465 /* Store a composed char or a component char C in charbuf, and update
3466    cmp_status.  */
3467
3468 #define STORE_COMPOSITION_CHAR(c)                                       \
3469   do {                                                                  \
3470     *charbuf++ = (c);                                                   \
3471     cmp_status->length++;                                               \
3472     if (cmp_status->state == COMPOSING_CHAR)                            \
3473       cmp_status->nchars++;                                             \
3474     else                                                                \
3475       cmp_status->ncomps++;                                             \
3476     if (cmp_status->method == COMPOSITION_WITH_RULE                     \
3477         || (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS        \
3478             && cmp_status->state == COMPOSING_COMPONENT_CHAR))          \
3479       cmp_status->state++;                                              \
3480   } while (0)
3481
3482
3483 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3484
3485 static void
3486 decode_coding_iso_2022 (struct coding_system *coding)
3487 {
3488   const unsigned char *src = coding->source + coding->consumed;
3489   const unsigned char *src_end = coding->source + coding->src_bytes;
3490   const unsigned char *src_base;
3491   int *charbuf = coding->charbuf + coding->charbuf_used;
3492   /* We may produce two annotations (charset and composition) in one
3493      loop and one more charset annotation at the end.  */
3494   int *charbuf_end
3495     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3496   int consumed_chars = 0, consumed_chars_base;
3497   int multibytep = coding->src_multibyte;
3498   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3499   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3500   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3501   int charset_id_2, charset_id_3;
3502   struct charset *charset;
3503   int c;
3504   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3505   Lisp_Object attrs, charset_list;
3506   int char_offset = coding->produced_char;
3507   int last_offset = char_offset;
3508   int last_id = charset_ascii;
3509   int eol_crlf =
3510     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3511   int byte_after_cr = -1;
3512   int i;
3513
3514   CODING_GET_INFO (coding, attrs, charset_list);
3515   setup_iso_safe_charsets (attrs);
3516   /* Charset list may have been changed.  */
3517   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3518   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3519
3520   if (cmp_status->state != COMPOSING_NO)
3521     {
3522       for (i = 0; i < cmp_status->length; i++)
3523         *charbuf++ = cmp_status->carryover[i];
3524       coding->annotated = 1;
3525     }
3526
3527   while (1)
3528     {
3529       int c1, c2, c3;
3530
3531       src_base = src;
3532       consumed_chars_base = consumed_chars;
3533
3534       if (charbuf >= charbuf_end)
3535         {
3536           if (byte_after_cr >= 0)
3537             src_base--;
3538           break;
3539         }
3540
3541       if (byte_after_cr >= 0)
3542         c1 = byte_after_cr, byte_after_cr = -1;
3543       else
3544         ONE_MORE_BYTE (c1);
3545       if (c1 < 0)
3546         goto invalid_code;
3547
3548       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3549         {
3550           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3551           char_offset++;
3552           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3553           continue;
3554         }
3555
3556       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3557         {
3558           if (c1 == ISO_CODE_ESC)
3559             {
3560               if (src + 1 >= src_end)
3561                 goto no_more_source;
3562               *charbuf++ = ISO_CODE_ESC;
3563               char_offset++;
3564               if (src[0] == '%' && src[1] == '@')
3565                 {
3566                   src += 2;
3567                   consumed_chars += 2;
3568                   char_offset += 2;
3569                   /* We are sure charbuf can contain two more chars. */
3570                   *charbuf++ = '%';
3571                   *charbuf++ = '@';
3572                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3573                 }
3574             }
3575           else
3576             {
3577               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3578               char_offset++;
3579             }
3580           continue;
3581         }
3582
3583       if ((cmp_status->state == COMPOSING_RULE
3584            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3585           && c1 != ISO_CODE_ESC)
3586         {
3587           int rule, nbytes;
3588
3589           DECODE_COMPOSITION_RULE (rule, nbytes);
3590           if (rule < 0)
3591             goto invalid_code;
3592           STORE_COMPOSITION_RULE (rule);
3593           continue;
3594         }
3595
3596       /* We produce at most one character.  */
3597       switch (iso_code_class [c1])
3598         {
3599         case ISO_0x20_or_0x7F:
3600           if (charset_id_0 < 0
3601               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3602             /* This is SPACE or DEL.  */
3603             charset = CHARSET_FROM_ID (charset_ascii);
3604           else
3605             charset = CHARSET_FROM_ID (charset_id_0);
3606           break;
3607
3608         case ISO_graphic_plane_0:
3609           if (charset_id_0 < 0)
3610             charset = CHARSET_FROM_ID (charset_ascii);
3611           else
3612             charset = CHARSET_FROM_ID (charset_id_0);
3613           break;
3614
3615         case ISO_0xA0_or_0xFF:
3616           if (charset_id_1 < 0
3617               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3618               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3619             goto invalid_code;
3620           /* This is a graphic character, we fall down ... */
3621
3622         case ISO_graphic_plane_1:
3623           if (charset_id_1 < 0)
3624             goto invalid_code;
3625           charset = CHARSET_FROM_ID (charset_id_1);
3626           break;
3627
3628         case ISO_control_0:
3629           if (eol_crlf && c1 == '\r')
3630             ONE_MORE_BYTE (byte_after_cr);
3631           MAYBE_FINISH_COMPOSITION ();
3632           charset = CHARSET_FROM_ID (charset_ascii);
3633           break;
3634
3635         case ISO_control_1:
3636           goto invalid_code;
3637
3638         case ISO_shift_out:
3639           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3640               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3641             goto invalid_code;
3642           CODING_ISO_INVOCATION (coding, 0) = 1;
3643           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3644           continue;
3645
3646         case ISO_shift_in:
3647           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3648             goto invalid_code;
3649           CODING_ISO_INVOCATION (coding, 0) = 0;
3650           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3651           continue;
3652
3653         case ISO_single_shift_2_7:
3654           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3655             goto invalid_code;
3656         case ISO_single_shift_2:
3657           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3658             goto invalid_code;
3659           /* SS2 is handled as an escape sequence of ESC 'N' */
3660           c1 = 'N';
3661           goto label_escape_sequence;
3662
3663         case ISO_single_shift_3:
3664           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3665             goto invalid_code;
3666           /* SS2 is handled as an escape sequence of ESC 'O' */
3667           c1 = 'O';
3668           goto label_escape_sequence;
3669
3670         case ISO_control_sequence_introducer:
3671           /* CSI is handled as an escape sequence of ESC '[' ...  */
3672           c1 = '[';
3673           goto label_escape_sequence;
3674
3675         case ISO_escape:
3676           ONE_MORE_BYTE (c1);
3677         label_escape_sequence:
3678           /* Escape sequences handled here are invocation,
3679              designation, direction specification, and character
3680              composition specification.  */
3681           switch (c1)
3682             {
3683             case '&':           /* revision of following character set */
3684               ONE_MORE_BYTE (c1);
3685               if (!(c1 >= '@' && c1 <= '~'))
3686                 goto invalid_code;
3687               ONE_MORE_BYTE (c1);
3688               if (c1 != ISO_CODE_ESC)
3689                 goto invalid_code;
3690               ONE_MORE_BYTE (c1);
3691               goto label_escape_sequence;
3692
3693             case '$':           /* designation of 2-byte character set */
3694               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3695                 goto invalid_code;
3696               {
3697                 int reg, chars96;
3698
3699                 ONE_MORE_BYTE (c1);
3700                 if (c1 >= '@' && c1 <= 'B')
3701                   {     /* designation of JISX0208.1978, GB2312.1980,
3702                            or JISX0208.1980 */
3703                     reg = 0, chars96 = 0;
3704                   }
3705                 else if (c1 >= 0x28 && c1 <= 0x2B)
3706                   { /* designation of DIMENSION2_CHARS94 character set */
3707                     reg = c1 - 0x28, chars96 = 0;
3708                     ONE_MORE_BYTE (c1);
3709                   }
3710                 else if (c1 >= 0x2C && c1 <= 0x2F)
3711                   { /* designation of DIMENSION2_CHARS96 character set */
3712                     reg = c1 - 0x2C, chars96 = 1;
3713                     ONE_MORE_BYTE (c1);
3714                   }
3715                 else
3716                   goto invalid_code;
3717                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3718                 /* We must update these variables now.  */
3719                 if (reg == 0)
3720                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3721                 else if (reg == 1)
3722                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3723                 if (chars96 < 0)
3724                   goto invalid_code;
3725               }
3726               continue;
3727
3728             case 'n':           /* invocation of locking-shift-2 */
3729               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3730                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3731                 goto invalid_code;
3732               CODING_ISO_INVOCATION (coding, 0) = 2;
3733               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3734               continue;
3735
3736             case 'o':           /* invocation of locking-shift-3 */
3737               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3738                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3739                 goto invalid_code;
3740               CODING_ISO_INVOCATION (coding, 0) = 3;
3741               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3742               continue;
3743
3744             case 'N':           /* invocation of single-shift-2 */
3745               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3746                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3747                 goto invalid_code;
3748               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3749               if (charset_id_2 < 0)
3750                 charset = CHARSET_FROM_ID (charset_ascii);
3751               else
3752                 charset = CHARSET_FROM_ID (charset_id_2);
3753               ONE_MORE_BYTE (c1);
3754               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3755                 goto invalid_code;
3756               break;
3757
3758             case 'O':           /* invocation of single-shift-3 */
3759               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3760                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3761                 goto invalid_code;
3762               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3763               if (charset_id_3 < 0)
3764                 charset = CHARSET_FROM_ID (charset_ascii);
3765               else
3766                 charset = CHARSET_FROM_ID (charset_id_3);
3767               ONE_MORE_BYTE (c1);
3768               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3769                 goto invalid_code;
3770               break;
3771
3772             case '0': case '2': case '3': case '4': /* start composition */
3773               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3774                 goto invalid_code;
3775               if (last_id != charset_ascii)
3776                 {
3777                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3778                   last_id = charset_ascii;
3779                   last_offset = char_offset;
3780                 }
3781               DECODE_COMPOSITION_START (c1);
3782               continue;
3783
3784             case '1':           /* end composition */
3785               if (cmp_status->state == COMPOSING_NO)
3786                 goto invalid_code;
3787               DECODE_COMPOSITION_END ();
3788               continue;
3789
3790             case '[':           /* specification of direction */
3791               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3792                 goto invalid_code;
3793               /* For the moment, nested direction is not supported.
3794                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3795                  left-to-right, and nonzero means right-to-left.  */
3796               ONE_MORE_BYTE (c1);
3797               switch (c1)
3798                 {
3799                 case ']':       /* end of the current direction */
3800                   coding->mode &= ~CODING_MODE_DIRECTION;
3801
3802                 case '0':       /* end of the current direction */
3803                 case '1':       /* start of left-to-right direction */
3804                   ONE_MORE_BYTE (c1);
3805                   if (c1 == ']')
3806                     coding->mode &= ~CODING_MODE_DIRECTION;
3807                   else
3808                     goto invalid_code;
3809                   break;
3810
3811                 case '2':       /* start of right-to-left direction */
3812                   ONE_MORE_BYTE (c1);
3813                   if (c1 == ']')
3814                     coding->mode |= CODING_MODE_DIRECTION;
3815                   else
3816                     goto invalid_code;
3817                   break;
3818
3819                 default:
3820                   goto invalid_code;
3821                 }
3822               continue;
3823
3824             case '%':
3825               ONE_MORE_BYTE (c1);
3826               if (c1 == '/')
3827                 {
3828                   /* CTEXT extended segment:
3829                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3830                      We keep these bytes as is for the moment.
3831                      They may be decoded by post-read-conversion.  */
3832                   int dim, M, L;
3833                   int size;
3834
3835                   ONE_MORE_BYTE (dim);
3836                   if (dim < '0' || dim > '4')
3837                     goto invalid_code;
3838                   ONE_MORE_BYTE (M);
3839                   if (M < 128)
3840                     goto invalid_code;
3841                   ONE_MORE_BYTE (L);
3842                   if (L < 128)
3843                     goto invalid_code;
3844                   size = ((M - 128) * 128) + (L - 128);
3845                   if (charbuf + 6 > charbuf_end)
3846                     goto break_loop;
3847                   *charbuf++ = ISO_CODE_ESC;
3848                   *charbuf++ = '%';
3849                   *charbuf++ = '/';
3850                   *charbuf++ = dim;
3851                   *charbuf++ = BYTE8_TO_CHAR (M);
3852                   *charbuf++ = BYTE8_TO_CHAR (L);
3853                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3854                 }
3855               else if (c1 == 'G')
3856                 {
3857                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3858                      ESC % G --UTF-8-BYTES-- ESC % @
3859                      We keep these bytes as is for the moment.
3860                      They may be decoded by post-read-conversion.  */
3861                   if (charbuf + 3 > charbuf_end)
3862                     goto break_loop;
3863                   *charbuf++ = ISO_CODE_ESC;
3864                   *charbuf++ = '%';
3865                   *charbuf++ = 'G';
3866                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3867                 }
3868               else
3869                 goto invalid_code;
3870               continue;
3871               break;
3872
3873             default:
3874               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3875                 goto invalid_code;
3876               {
3877                 int reg, chars96;
3878
3879                 if (c1 >= 0x28 && c1 <= 0x2B)
3880                   { /* designation of DIMENSION1_CHARS94 character set */
3881                     reg = c1 - 0x28, chars96 = 0;
3882                     ONE_MORE_BYTE (c1);
3883                   }
3884                 else if (c1 >= 0x2C && c1 <= 0x2F)
3885                   { /* designation of DIMENSION1_CHARS96 character set */
3886                     reg = c1 - 0x2C, chars96 = 1;
3887                     ONE_MORE_BYTE (c1);
3888                   }
3889                 else
3890                   goto invalid_code;
3891                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3892                 /* We must update these variables now.  */
3893                 if (reg == 0)
3894                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3895                 else if (reg == 1)
3896                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3897                 if (chars96 < 0)
3898                   goto invalid_code;
3899               }
3900               continue;
3901             }
3902         }
3903
3904       if (cmp_status->state == COMPOSING_NO
3905           && charset->id != charset_ascii
3906           && last_id != charset->id)
3907         {
3908           if (last_id != charset_ascii)
3909             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3910           last_id = charset->id;
3911           last_offset = char_offset;
3912         }
3913
3914       /* Now we know CHARSET and 1st position code C1 of a character.
3915          Produce a decoded character while getting 2nd and 3rd
3916          position codes C2, C3 if necessary.  */
3917       if (CHARSET_DIMENSION (charset) > 1)
3918         {
3919           ONE_MORE_BYTE (c2);
3920           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3921               || ((c1 & 0x80) != (c2 & 0x80)))
3922             /* C2 is not in a valid range.  */
3923             goto invalid_code;
3924           if (CHARSET_DIMENSION (charset) == 2)
3925             c1 = (c1 << 8) | c2;
3926           else
3927             {
3928               ONE_MORE_BYTE (c3);
3929               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
3930                   || ((c1 & 0x80) != (c3 & 0x80)))
3931                 /* C3 is not in a valid range.  */
3932                 goto invalid_code;
3933               c1 = (c1 << 16) | (c2 << 8) | c2;
3934             }
3935         }
3936       c1 &= 0x7F7F7F;
3937       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3938       if (c < 0)
3939         {
3940           MAYBE_FINISH_COMPOSITION ();
3941           for (; src_base < src; src_base++, char_offset++)
3942             {
3943               if (ASCII_BYTE_P (*src_base))
3944                 *charbuf++ = *src_base;
3945               else
3946                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3947             }
3948         }
3949       else if (cmp_status->state == COMPOSING_NO)
3950         {
3951           *charbuf++ = c;
3952           char_offset++;
3953         }
3954       else if ((cmp_status->state == COMPOSING_CHAR
3955                 ? cmp_status->nchars
3956                 : cmp_status->ncomps)
3957                >= MAX_COMPOSITION_COMPONENTS)
3958         {
3959           /* Too long composition.  */
3960           MAYBE_FINISH_COMPOSITION ();
3961           *charbuf++ = c;
3962           char_offset++;
3963         }
3964       else
3965         STORE_COMPOSITION_CHAR (c);
3966       continue;
3967
3968     invalid_code:
3969       MAYBE_FINISH_COMPOSITION ();
3970       src = src_base;
3971       consumed_chars = consumed_chars_base;
3972       ONE_MORE_BYTE (c);
3973       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3974       char_offset++;
3975       coding->errors++;
3976       continue;
3977
3978     break_loop:
3979       break;
3980     }
3981
3982  no_more_source:
3983   if (cmp_status->state != COMPOSING_NO)
3984     {
3985       if (coding->mode & CODING_MODE_LAST_BLOCK)
3986         MAYBE_FINISH_COMPOSITION ();
3987       else
3988         {
3989           charbuf -= cmp_status->length;
3990           for (i = 0; i < cmp_status->length; i++)
3991             cmp_status->carryover[i] = charbuf[i];
3992         }
3993     }
3994   else if (last_id != charset_ascii)
3995     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3996   coding->consumed_char += consumed_chars_base;
3997   coding->consumed = src_base - coding->source;
3998   coding->charbuf_used = charbuf - coding->charbuf;
3999 }
4000
4001
4002 /* ISO2022 encoding stuff.  */
4003
4004 /*
4005    It is not enough to say just "ISO2022" on encoding, we have to
4006    specify more details.  In Emacs, each coding system of ISO2022
4007    variant has the following specifications:
4008         1. Initial designation to G0 thru G3.
4009         2. Allows short-form designation?
4010         3. ASCII should be designated to G0 before control characters?
4011         4. ASCII should be designated to G0 at end of line?
4012         5. 7-bit environment or 8-bit environment?
4013         6. Use locking-shift?
4014         7. Use Single-shift?
4015    And the following two are only for Japanese:
4016         8. Use ASCII in place of JIS0201-1976-Roman?
4017         9. Use JISX0208-1983 in place of JISX0208-1978?
4018    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4019    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4020    details.
4021 */
4022
4023 /* Produce codes (escape sequence) for designating CHARSET to graphic
4024    register REG at DST, and increment DST.  If <final-char> of CHARSET is
4025    '@', 'A', or 'B' and the coding system CODING allows, produce
4026    designation sequence of short-form.  */
4027
4028 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
4029   do {                                                                  \
4030     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
4031     const char *intermediate_char_94 = "()*+";                          \
4032     const char *intermediate_char_96 = ",-./";                          \
4033     int revision = -1;                                                  \
4034     int c;                                                              \
4035                                                                         \
4036     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
4037       revision = CHARSET_ISO_REVISION (charset);                        \
4038                                                                         \
4039     if (revision >= 0)                                                  \
4040       {                                                                 \
4041         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
4042         EMIT_ONE_BYTE ('@' + revision);                                 \
4043       }                                                                 \
4044     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
4045     if (CHARSET_DIMENSION (charset) == 1)                               \
4046       {                                                                 \
4047         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4048           c = intermediate_char_94[reg];                                \
4049         else                                                            \
4050           c = intermediate_char_96[reg];                                \
4051         EMIT_ONE_ASCII_BYTE (c);                                        \
4052       }                                                                 \
4053     else                                                                \
4054       {                                                                 \
4055         EMIT_ONE_ASCII_BYTE ('$');                                      \
4056         if (! CHARSET_ISO_CHARS_96 (charset))                           \
4057           {                                                             \
4058             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
4059                 || reg != 0                                             \
4060                 || final_char < '@' || final_char > 'B')                \
4061               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
4062           }                                                             \
4063         else                                                            \
4064           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
4065       }                                                                 \
4066     EMIT_ONE_ASCII_BYTE (final_char);                                   \
4067                                                                         \
4068     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
4069   } while (0)
4070
4071
4072 /* The following two macros produce codes (control character or escape
4073    sequence) for ISO2022 single-shift functions (single-shift-2 and
4074    single-shift-3).  */
4075
4076 #define ENCODE_SINGLE_SHIFT_2                                           \
4077   do {                                                                  \
4078     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4079       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
4080     else                                                                \
4081       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
4082     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4083   } while (0)
4084
4085
4086 #define ENCODE_SINGLE_SHIFT_3                                           \
4087   do {                                                                  \
4088     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
4089       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
4090     else                                                                \
4091       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
4092     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
4093   } while (0)
4094
4095
4096 /* The following four macros produce codes (control character or
4097    escape sequence) for ISO2022 locking-shift functions (shift-in,
4098    shift-out, locking-shift-2, and locking-shift-3).  */
4099
4100 #define ENCODE_SHIFT_IN                                 \
4101   do {                                                  \
4102     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
4103     CODING_ISO_INVOCATION (coding, 0) = 0;              \
4104   } while (0)
4105
4106
4107 #define ENCODE_SHIFT_OUT                                \
4108   do {                                                  \
4109     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
4110     CODING_ISO_INVOCATION (coding, 0) = 1;              \
4111   } while (0)
4112
4113
4114 #define ENCODE_LOCKING_SHIFT_2                          \
4115   do {                                                  \
4116     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4117     CODING_ISO_INVOCATION (coding, 0) = 2;              \
4118   } while (0)
4119
4120
4121 #define ENCODE_LOCKING_SHIFT_3                          \
4122   do {                                                  \
4123     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
4124     CODING_ISO_INVOCATION (coding, 0) = 3;              \
4125   } while (0)
4126
4127
4128 /* Produce codes for a DIMENSION1 character whose character set is
4129    CHARSET and whose position-code is C1.  Designation and invocation
4130    sequences are also produced in advance if necessary.  */
4131
4132 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
4133   do {                                                                  \
4134     int id = CHARSET_ID (charset);                                      \
4135                                                                         \
4136     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
4137         && id == charset_ascii)                                         \
4138       {                                                                 \
4139         id = charset_jisx0201_roman;                                    \
4140         charset = CHARSET_FROM_ID (id);                                 \
4141       }                                                                 \
4142                                                                         \
4143     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4144       {                                                                 \
4145         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4146           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
4147         else                                                            \
4148           EMIT_ONE_BYTE (c1 | 0x80);                                    \
4149         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4150         break;                                                          \
4151       }                                                                 \
4152     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4153       {                                                                 \
4154         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
4155         break;                                                          \
4156       }                                                                 \
4157     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4158       {                                                                 \
4159         EMIT_ONE_BYTE (c1 | 0x80);                                      \
4160         break;                                                          \
4161       }                                                                 \
4162     else                                                                \
4163       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4164          must invoke it, or, at first, designate it to some graphic     \
4165          register.  Then repeat the loop to actually produce the        \
4166          character.  */                                                 \
4167       dst = encode_invocation_designation (charset, coding, dst,        \
4168                                            &produced_chars);            \
4169   } while (1)
4170
4171
4172 /* Produce codes for a DIMENSION2 character whose character set is
4173    CHARSET and whose position-codes are C1 and C2.  Designation and
4174    invocation codes are also produced in advance if necessary.  */
4175
4176 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
4177   do {                                                                  \
4178     int id = CHARSET_ID (charset);                                      \
4179                                                                         \
4180     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
4181         && id == charset_jisx0208)                                      \
4182       {                                                                 \
4183         id = charset_jisx0208_1978;                                     \
4184         charset = CHARSET_FROM_ID (id);                                 \
4185       }                                                                 \
4186                                                                         \
4187     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
4188       {                                                                 \
4189         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
4190           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
4191         else                                                            \
4192           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
4193         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
4194         break;                                                          \
4195       }                                                                 \
4196     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
4197       {                                                                 \
4198         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
4199         break;                                                          \
4200       }                                                                 \
4201     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
4202       {                                                                 \
4203         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
4204         break;                                                          \
4205       }                                                                 \
4206     else                                                                \
4207       /* Since CHARSET is not yet invoked to any graphic planes, we     \
4208          must invoke it, or, at first, designate it to some graphic     \
4209          register.  Then repeat the loop to actually produce the        \
4210          character.  */                                                 \
4211       dst = encode_invocation_designation (charset, coding, dst,        \
4212                                            &produced_chars);            \
4213   } while (1)
4214
4215
4216 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
4217   do {                                                                     \
4218     int code = ENCODE_CHAR ((charset), (c));                               \
4219                                                                            \
4220     if (CHARSET_DIMENSION (charset) == 1)                                  \
4221       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
4222     else                                                                   \
4223       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
4224   } while (0)
4225
4226
4227 /* Produce designation and invocation codes at a place pointed by DST
4228    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4229    Return new DST.  */
4230
4231 unsigned char *
4232 encode_invocation_designation (struct charset *charset,
4233                                struct coding_system *coding,
4234                                unsigned char *dst, int *p_nchars)
4235 {
4236   int multibytep = coding->dst_multibyte;
4237   int produced_chars = *p_nchars;
4238   int reg;                      /* graphic register number */
4239   int id = CHARSET_ID (charset);
4240
4241   /* At first, check designations.  */
4242   for (reg = 0; reg < 4; reg++)
4243     if (id == CODING_ISO_DESIGNATION (coding, reg))
4244       break;
4245
4246   if (reg >= 4)
4247     {
4248       /* CHARSET is not yet designated to any graphic registers.  */
4249       /* At first check the requested designation.  */
4250       reg = CODING_ISO_REQUEST (coding, id);
4251       if (reg < 0)
4252         /* Since CHARSET requests no special designation, designate it
4253            to graphic register 0.  */
4254         reg = 0;
4255
4256       ENCODE_DESIGNATION (charset, reg, coding);
4257     }
4258
4259   if (CODING_ISO_INVOCATION (coding, 0) != reg
4260       && CODING_ISO_INVOCATION (coding, 1) != reg)
4261     {
4262       /* Since the graphic register REG is not invoked to any graphic
4263          planes, invoke it to graphic plane 0.  */
4264       switch (reg)
4265         {
4266         case 0:                 /* graphic register 0 */
4267           ENCODE_SHIFT_IN;
4268           break;
4269
4270         case 1:                 /* graphic register 1 */
4271           ENCODE_SHIFT_OUT;
4272           break;
4273
4274         case 2:                 /* graphic register 2 */
4275           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4276             ENCODE_SINGLE_SHIFT_2;
4277           else
4278             ENCODE_LOCKING_SHIFT_2;
4279           break;
4280
4281         case 3:                 /* graphic register 3 */
4282           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4283             ENCODE_SINGLE_SHIFT_3;
4284           else
4285             ENCODE_LOCKING_SHIFT_3;
4286           break;
4287         }
4288     }
4289
4290   *p_nchars = produced_chars;
4291   return dst;
4292 }
4293
4294 /* The following three macros produce codes for indicating direction
4295    of text.  */
4296 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
4297   do {                                                                  \
4298     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
4299       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
4300     else                                                                \
4301       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
4302   } while (0)
4303
4304
4305 #define ENCODE_DIRECTION_R2L()                  \
4306   do {                                          \
4307     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
4308     EMIT_TWO_ASCII_BYTES ('2', ']');            \
4309   } while (0)
4310
4311
4312 #define ENCODE_DIRECTION_L2R()                  \
4313   do {                                          \
4314     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
4315     EMIT_TWO_ASCII_BYTES ('0', ']');            \
4316   } while (0)
4317
4318
4319 /* Produce codes for designation and invocation to reset the graphic
4320    planes and registers to initial state.  */
4321 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
4322   do {                                                                  \
4323     int reg;                                                            \
4324     struct charset *charset;                                            \
4325                                                                         \
4326     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
4327       ENCODE_SHIFT_IN;                                                  \
4328     for (reg = 0; reg < 4; reg++)                                       \
4329       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
4330           && (CODING_ISO_DESIGNATION (coding, reg)                      \
4331               != CODING_ISO_INITIAL (coding, reg)))                     \
4332         {                                                               \
4333           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
4334           ENCODE_DESIGNATION (charset, reg, coding);                    \
4335         }                                                               \
4336   } while (0)
4337
4338
4339 /* Produce designation sequences of charsets in the line started from
4340    SRC to a place pointed by DST, and return updated DST.
4341
4342    If the current block ends before any end-of-line, we may fail to
4343    find all the necessary designations.  */
4344
4345 static unsigned char *
4346 encode_designation_at_bol (struct coding_system *coding, int *charbuf,
4347                            int *charbuf_end, unsigned char *dst)
4348 {
4349   struct charset *charset;
4350   /* Table of charsets to be designated to each graphic register.  */
4351   int r[4];
4352   int c, found = 0, reg;
4353   int produced_chars = 0;
4354   int multibytep = coding->dst_multibyte;
4355   Lisp_Object attrs;
4356   Lisp_Object charset_list;
4357
4358   attrs = CODING_ID_ATTRS (coding->id);
4359   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4360   if (EQ (charset_list, Qiso_2022))
4361     charset_list = Viso_2022_charset_list;
4362
4363   for (reg = 0; reg < 4; reg++)
4364     r[reg] = -1;
4365
4366   while (found < 4)
4367     {
4368       int id;
4369
4370       c = *charbuf++;
4371       if (c == '\n')
4372         break;
4373       charset = char_charset (c, charset_list, NULL);
4374       id = CHARSET_ID (charset);
4375       reg = CODING_ISO_REQUEST (coding, id);
4376       if (reg >= 0 && r[reg] < 0)
4377         {
4378           found++;
4379           r[reg] = id;
4380         }
4381     }
4382
4383   if (found)
4384     {
4385       for (reg = 0; reg < 4; reg++)
4386         if (r[reg] >= 0
4387             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4388           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4389     }
4390
4391   return dst;
4392 }
4393
4394 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
4395
4396 static int
4397 encode_coding_iso_2022 (struct coding_system *coding)
4398 {
4399   int multibytep = coding->dst_multibyte;
4400   int *charbuf = coding->charbuf;
4401   int *charbuf_end = charbuf + coding->charbuf_used;
4402   unsigned char *dst = coding->destination + coding->produced;
4403   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4404   int safe_room = 16;
4405   int bol_designation
4406     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4407        && CODING_ISO_BOL (coding));
4408   int produced_chars = 0;
4409   Lisp_Object attrs, eol_type, charset_list;
4410   int ascii_compatible;
4411   int c;
4412   int preferred_charset_id = -1;
4413
4414   CODING_GET_INFO (coding, attrs, charset_list);
4415   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4416   if (VECTORP (eol_type))
4417     eol_type = Qunix;
4418
4419   setup_iso_safe_charsets (attrs);
4420   /* Charset list may have been changed.  */
4421   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4422   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4423
4424   ascii_compatible
4425     = (! NILP (CODING_ATTR_ASCII_COMPAT (attrs))
4426        && ! (CODING_ISO_FLAGS (coding) & (CODING_ISO_FLAG_DESIGNATION
4427                                           | CODING_ISO_FLAG_LOCKING_SHIFT)));
4428
4429   while (charbuf < charbuf_end)
4430     {
4431       ASSURE_DESTINATION (safe_room);
4432
4433       if (bol_designation)
4434         {
4435           unsigned char *dst_prev = dst;
4436
4437           /* We have to produce designation sequences if any now.  */
4438           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4439           bol_designation = 0;
4440           /* We are sure that designation sequences are all ASCII bytes.  */
4441           produced_chars += dst - dst_prev;
4442         }
4443
4444       c = *charbuf++;
4445
4446       if (c < 0)
4447         {
4448           /* Handle an annotation.  */
4449           switch (*charbuf)
4450             {
4451             case CODING_ANNOTATE_COMPOSITION_MASK:
4452               /* Not yet implemented.  */
4453               break;
4454             case CODING_ANNOTATE_CHARSET_MASK:
4455               preferred_charset_id = charbuf[2];
4456               if (preferred_charset_id >= 0
4457                   && NILP (Fmemq (make_number (preferred_charset_id),
4458                                   charset_list)))
4459                 preferred_charset_id = -1;
4460               break;
4461             default:
4462               abort ();
4463             }
4464           charbuf += -c - 1;
4465           continue;
4466         }
4467
4468       /* Now encode the character C.  */
4469       if (c < 0x20 || c == 0x7F)
4470         {
4471           if (c == '\n'
4472               || (c == '\r' && EQ (eol_type, Qmac)))
4473             {
4474               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4475                 ENCODE_RESET_PLANE_AND_REGISTER ();
4476               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4477                 {
4478                   int i;
4479
4480                   for (i = 0; i < 4; i++)
4481                     CODING_ISO_DESIGNATION (coding, i)
4482                       = CODING_ISO_INITIAL (coding, i);
4483                 }
4484               bol_designation
4485                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4486             }
4487           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4488             ENCODE_RESET_PLANE_AND_REGISTER ();
4489           EMIT_ONE_ASCII_BYTE (c);
4490         }
4491       else if (ASCII_CHAR_P (c))
4492         {
4493           if (ascii_compatible)
4494             EMIT_ONE_ASCII_BYTE (c);
4495           else
4496             {
4497               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4498               ENCODE_ISO_CHARACTER (charset, c);
4499             }
4500         }
4501       else if (CHAR_BYTE8_P (c))
4502         {
4503           c = CHAR_TO_BYTE8 (c);
4504           EMIT_ONE_BYTE (c);
4505         }
4506       else
4507         {
4508           struct charset *charset;
4509
4510           if (preferred_charset_id >= 0)
4511             {
4512               charset = CHARSET_FROM_ID (preferred_charset_id);
4513               if (! CHAR_CHARSET_P (c, charset))
4514                 charset = char_charset (c, charset_list, NULL);
4515             }
4516           else
4517             charset = char_charset (c, charset_list, NULL);
4518           if (!charset)
4519             {
4520               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4521                 {
4522                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4523                   charset = CHARSET_FROM_ID (charset_ascii);
4524                 }
4525               else
4526                 {
4527                   c = coding->default_char;
4528                   charset = char_charset (c, charset_list, NULL);
4529                 }
4530             }
4531           ENCODE_ISO_CHARACTER (charset, c);
4532         }
4533     }
4534
4535   if (coding->mode & CODING_MODE_LAST_BLOCK
4536       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4537     {
4538       ASSURE_DESTINATION (safe_room);
4539       ENCODE_RESET_PLANE_AND_REGISTER ();
4540     }
4541   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4542   CODING_ISO_BOL (coding) = bol_designation;
4543   coding->produced_char += produced_chars;
4544   coding->produced = dst - coding->destination;
4545   return 0;
4546 }
4547
4548 \f
4549 /*** 8,9. SJIS and BIG5 handlers ***/
4550
4551 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4552    quite widely.  So, for the moment, Emacs supports them in the bare
4553    C code.  But, in the future, they may be supported only by CCL.  */
4554
4555 /* SJIS is a coding system encoding three character sets: ASCII, right
4556    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4557    as is.  A character of charset katakana-jisx0201 is encoded by
4558    "position-code + 0x80".  A character of charset japanese-jisx0208
4559    is encoded in 2-byte but two position-codes are divided and shifted
4560    so that it fit in the range below.
4561
4562    --- CODE RANGE of SJIS ---
4563    (character set)      (range)
4564    ASCII                0x00 .. 0x7F
4565    KATAKANA-JISX0201    0xA0 .. 0xDF
4566    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4567             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4568    -------------------------------
4569
4570 */
4571
4572 /* BIG5 is a coding system encoding two character sets: ASCII and
4573    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4574    character set and is encoded in two-byte.
4575
4576    --- CODE RANGE of BIG5 ---
4577    (character set)      (range)
4578    ASCII                0x00 .. 0x7F
4579    Big5 (1st byte)      0xA1 .. 0xFE
4580         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4581    --------------------------
4582
4583   */
4584
4585 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4586    Check if a text is encoded in SJIS.  If it is, return
4587    CATEGORY_MASK_SJIS, else return 0.  */
4588
4589 static int
4590 detect_coding_sjis (struct coding_system *coding,
4591                     struct coding_detection_info *detect_info)
4592 {
4593   const unsigned char *src = coding->source, *src_base;
4594   const unsigned char *src_end = coding->source + coding->src_bytes;
4595   int multibytep = coding->src_multibyte;
4596   int consumed_chars = 0;
4597   int found = 0;
4598   int c;
4599   Lisp_Object attrs, charset_list;
4600   int max_first_byte_of_2_byte_code;
4601
4602   CODING_GET_INFO (coding, attrs, charset_list);
4603   max_first_byte_of_2_byte_code
4604     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4605
4606   detect_info->checked |= CATEGORY_MASK_SJIS;
4607   /* A coding system of this category is always ASCII compatible.  */
4608   src += coding->head_ascii;
4609
4610   while (1)
4611     {
4612       src_base = src;
4613       ONE_MORE_BYTE (c);
4614       if (c < 0x80)
4615         continue;
4616       if ((c >= 0x81 && c <= 0x9F)
4617           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4618         {
4619           ONE_MORE_BYTE (c);
4620           if (c < 0x40 || c == 0x7F || c > 0xFC)
4621             break;
4622           found = CATEGORY_MASK_SJIS;
4623         }
4624       else if (c >= 0xA0 && c < 0xE0)
4625         found = CATEGORY_MASK_SJIS;
4626       else
4627         break;
4628     }
4629   detect_info->rejected |= CATEGORY_MASK_SJIS;
4630   return 0;
4631
4632  no_more_source:
4633   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4634     {
4635       detect_info->rejected |= CATEGORY_MASK_SJIS;
4636       return 0;
4637     }
4638   detect_info->found |= found;
4639   return 1;
4640 }
4641
4642 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4643    Check if a text is encoded in BIG5.  If it is, return
4644    CATEGORY_MASK_BIG5, else return 0.  */
4645
4646 static int
4647 detect_coding_big5 (struct coding_system *coding,
4648                     struct coding_detection_info *detect_info)
4649 {
4650   const unsigned char *src = coding->source, *src_base;
4651   const unsigned char *src_end = coding->source + coding->src_bytes;
4652   int multibytep = coding->src_multibyte;
4653   int consumed_chars = 0;
4654   int found = 0;
4655   int c;
4656
4657   detect_info->checked |= CATEGORY_MASK_BIG5;
4658   /* A coding system of this category is always ASCII compatible.  */
4659   src += coding->head_ascii;
4660
4661   while (1)
4662     {
4663       src_base = src;
4664       ONE_MORE_BYTE (c);
4665       if (c < 0x80)
4666         continue;
4667       if (c >= 0xA1)
4668         {
4669           ONE_MORE_BYTE (c);
4670           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4671             return 0;
4672           found = CATEGORY_MASK_BIG5;
4673         }
4674       else
4675         break;
4676     }
4677   detect_info->rejected |= CATEGORY_MASK_BIG5;
4678   return 0;
4679
4680  no_more_source:
4681   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4682     {
4683       detect_info->rejected |= CATEGORY_MASK_BIG5;
4684       return 0;
4685     }
4686   detect_info->found |= found;
4687   return 1;
4688 }
4689
4690 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4691    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4692
4693 static void
4694 decode_coding_sjis (struct coding_system *coding)
4695 {
4696   const unsigned char *src = coding->source + coding->consumed;
4697   const unsigned char *src_end = coding->source + coding->src_bytes;
4698   const unsigned char *src_base;
4699   int *charbuf = coding->charbuf + coding->charbuf_used;
4700   /* We may produce one charset annotation in one loop and one more at
4701      the end.  */
4702   int *charbuf_end
4703     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4704   int consumed_chars = 0, consumed_chars_base;
4705   int multibytep = coding->src_multibyte;
4706   struct charset *charset_roman, *charset_kanji, *charset_kana;
4707   struct charset *charset_kanji2;
4708   Lisp_Object attrs, charset_list, val;
4709   int char_offset = coding->produced_char;
4710   int last_offset = char_offset;
4711   int last_id = charset_ascii;
4712   int eol_crlf =
4713     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4714   int byte_after_cr = -1;
4715
4716   CODING_GET_INFO (coding, attrs, charset_list);
4717
4718   val = charset_list;
4719   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4720   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4721   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4722   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4723
4724   while (1)
4725     {
4726       int c, c1;
4727       struct charset *charset;
4728
4729       src_base = src;
4730       consumed_chars_base = consumed_chars;
4731
4732       if (charbuf >= charbuf_end)
4733         {
4734           if (byte_after_cr >= 0)
4735             src_base--;
4736           break;
4737         }
4738
4739       if (byte_after_cr >= 0)
4740         c = byte_after_cr, byte_after_cr = -1;
4741       else
4742         ONE_MORE_BYTE (c);
4743       if (c < 0)
4744         goto invalid_code;
4745       if (c < 0x80)
4746         {
4747           if (eol_crlf && c == '\r')
4748             ONE_MORE_BYTE (byte_after_cr);
4749           charset = charset_roman;
4750         }
4751       else if (c == 0x80 || c == 0xA0)
4752         goto invalid_code;
4753       else if (c >= 0xA1 && c <= 0xDF)
4754         {
4755           /* SJIS -> JISX0201-Kana */
4756           c &= 0x7F;
4757           charset = charset_kana;
4758         }
4759       else if (c <= 0xEF)
4760         {
4761           /* SJIS -> JISX0208 */
4762           ONE_MORE_BYTE (c1);
4763           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4764             goto invalid_code;
4765           c = (c << 8) | c1;
4766           SJIS_TO_JIS (c);
4767           charset = charset_kanji;
4768         }
4769       else if (c <= 0xFC && charset_kanji2)
4770         {
4771           /* SJIS -> JISX0213-2 */
4772           ONE_MORE_BYTE (c1);
4773           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4774             goto invalid_code;
4775           c = (c << 8) | c1;
4776           SJIS_TO_JIS2 (c);
4777           charset = charset_kanji2;
4778         }
4779       else
4780         goto invalid_code;
4781       if (charset->id != charset_ascii
4782           && last_id != charset->id)
4783         {
4784           if (last_id != charset_ascii)
4785             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4786           last_id = charset->id;
4787           last_offset = char_offset;
4788         }
4789       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4790       *charbuf++ = c;
4791       char_offset++;
4792       continue;
4793
4794     invalid_code:
4795       src = src_base;
4796       consumed_chars = consumed_chars_base;
4797       ONE_MORE_BYTE (c);
4798       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4799       char_offset++;
4800       coding->errors++;
4801     }
4802
4803  no_more_source:
4804   if (last_id != charset_ascii)
4805     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4806   coding->consumed_char += consumed_chars_base;
4807   coding->consumed = src_base - coding->source;
4808   coding->charbuf_used = charbuf - coding->charbuf;
4809 }
4810
4811 static void
4812 decode_coding_big5 (struct coding_system *coding)
4813 {
4814   const unsigned char *src = coding->source + coding->consumed;
4815   const unsigned char *src_end = coding->source + coding->src_bytes;
4816   const unsigned char *src_base;
4817   int *charbuf = coding->charbuf + coding->charbuf_used;
4818   /* We may produce one charset annotation in one loop and one more at
4819      the end.  */
4820   int *charbuf_end
4821     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4822   int consumed_chars = 0, consumed_chars_base;
4823   int multibytep = coding->src_multibyte;
4824   struct charset *charset_roman, *charset_big5;
4825   Lisp_Object attrs, charset_list, val;
4826   int char_offset = coding->produced_char;
4827   int last_offset = char_offset;
4828   int last_id = charset_ascii;
4829   int eol_crlf =
4830     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
4831   int byte_after_cr = -1;
4832
4833   CODING_GET_INFO (coding, attrs, charset_list);
4834   val = charset_list;
4835   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4836   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4837
4838   while (1)
4839     {
4840       int c, c1;
4841       struct charset *charset;
4842
4843       src_base = src;
4844       consumed_chars_base = consumed_chars;
4845
4846       if (charbuf >= charbuf_end)
4847         {
4848           if (byte_after_cr >= 0)
4849             src_base--;
4850           break;
4851         }
4852
4853       if (byte_after_cr >= 0)
4854         c = byte_after_cr, byte_after_cr = -1;
4855       else
4856         ONE_MORE_BYTE (c);
4857
4858       if (c < 0)
4859         goto invalid_code;
4860       if (c < 0x80)
4861         {
4862           if (eol_crlf && c == '\r')
4863             ONE_MORE_BYTE (byte_after_cr);
4864           charset = charset_roman;
4865         }
4866       else
4867         {
4868           /* BIG5 -> Big5 */
4869           if (c < 0xA1 || c > 0xFE)
4870             goto invalid_code;
4871           ONE_MORE_BYTE (c1);
4872           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4873             goto invalid_code;
4874           c = c << 8 | c1;
4875           charset = charset_big5;
4876         }
4877       if (charset->id != charset_ascii
4878           && last_id != charset->id)
4879         {
4880           if (last_id != charset_ascii)
4881             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4882           last_id = charset->id;
4883           last_offset = char_offset;
4884         }
4885       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4886       *charbuf++ = c;
4887       char_offset++;
4888       continue;
4889
4890     invalid_code:
4891       src = src_base;
4892       consumed_chars = consumed_chars_base;
4893       ONE_MORE_BYTE (c);
4894       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4895       char_offset++;
4896       coding->errors++;
4897     }
4898
4899  no_more_source:
4900   if (last_id != charset_ascii)
4901     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4902   coding->consumed_char += consumed_chars_base;
4903   coding->consumed = src_base - coding->source;
4904   coding->charbuf_used = charbuf - coding->charbuf;
4905 }
4906
4907 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4908    This function can encode charsets `ascii', `katakana-jisx0201',
4909    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4910    are sure that all these charsets are registered as official charset
4911    (i.e. do not have extended leading-codes).  Characters of other
4912    charsets are produced without any encoding.  If SJIS_P is 1, encode
4913    SJIS text, else encode BIG5 text.  */
4914
4915 static int
4916 encode_coding_sjis (struct coding_system *coding)
4917 {
4918   int multibytep = coding->dst_multibyte;
4919   int *charbuf = coding->charbuf;
4920   int *charbuf_end = charbuf + coding->charbuf_used;
4921   unsigned char *dst = coding->destination + coding->produced;
4922   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4923   int safe_room = 4;
4924   int produced_chars = 0;
4925   Lisp_Object attrs, charset_list, val;
4926   int ascii_compatible;
4927   struct charset *charset_roman, *charset_kanji, *charset_kana;
4928   struct charset *charset_kanji2;
4929   int c;
4930
4931   CODING_GET_INFO (coding, attrs, charset_list);
4932   val = charset_list;
4933   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4934   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4935   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4936   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4937
4938   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4939
4940   while (charbuf < charbuf_end)
4941     {
4942       ASSURE_DESTINATION (safe_room);
4943       c = *charbuf++;
4944       /* Now encode the character C.  */
4945       if (ASCII_CHAR_P (c) && ascii_compatible)
4946         EMIT_ONE_ASCII_BYTE (c);
4947       else if (CHAR_BYTE8_P (c))
4948         {
4949           c = CHAR_TO_BYTE8 (c);
4950           EMIT_ONE_BYTE (c);
4951         }
4952       else
4953         {
4954           unsigned code;
4955           struct charset *charset = char_charset (c, charset_list, &code);
4956
4957           if (!charset)
4958             {
4959               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4960                 {
4961                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4962                   charset = CHARSET_FROM_ID (charset_ascii);
4963                 }
4964               else
4965                 {
4966                   c = coding->default_char;
4967                   charset = char_charset (c, charset_list, &code);
4968                 }
4969             }
4970           if (code == CHARSET_INVALID_CODE (charset))
4971             abort ();
4972           if (charset == charset_kanji)
4973             {
4974               int c1, c2;
4975               JIS_TO_SJIS (code);
4976               c1 = code >> 8, c2 = code & 0xFF;
4977               EMIT_TWO_BYTES (c1, c2);
4978             }
4979           else if (charset == charset_kana)
4980             EMIT_ONE_BYTE (code | 0x80);
4981           else if (charset_kanji2 && charset == charset_kanji2)
4982             {
4983               int c1, c2;
4984
4985               c1 = code >> 8;
4986               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
4987                   || c1 == 0x28
4988                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4989                 {
4990                   JIS_TO_SJIS2 (code);
4991                   c1 = code >> 8, c2 = code & 0xFF;
4992                   EMIT_TWO_BYTES (c1, c2);
4993                 }
4994               else
4995                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4996             }
4997           else
4998             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4999         }
5000     }
5001   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5002   coding->produced_char += produced_chars;
5003   coding->produced = dst - coding->destination;
5004   return 0;
5005 }
5006
5007 static int
5008 encode_coding_big5 (struct coding_system *coding)
5009 {
5010   int multibytep = coding->dst_multibyte;
5011   int *charbuf = coding->charbuf;
5012   int *charbuf_end = charbuf + coding->charbuf_used;
5013   unsigned char *dst = coding->destination + coding->produced;
5014   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5015   int safe_room = 4;
5016   int produced_chars = 0;
5017   Lisp_Object attrs, charset_list, val;
5018   int ascii_compatible;
5019   struct charset *charset_roman, *charset_big5;
5020   int c;
5021
5022   CODING_GET_INFO (coding, attrs, charset_list);
5023   val = charset_list;
5024   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5025   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5026   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5027
5028   while (charbuf < charbuf_end)
5029     {
5030       ASSURE_DESTINATION (safe_room);
5031       c = *charbuf++;
5032       /* Now encode the character C.  */
5033       if (ASCII_CHAR_P (c) && ascii_compatible)
5034         EMIT_ONE_ASCII_BYTE (c);
5035       else if (CHAR_BYTE8_P (c))
5036         {
5037           c = CHAR_TO_BYTE8 (c);
5038           EMIT_ONE_BYTE (c);
5039         }
5040       else
5041         {
5042           unsigned code;
5043           struct charset *charset = char_charset (c, charset_list, &code);
5044
5045           if (! charset)
5046             {
5047               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5048                 {
5049                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5050                   charset = CHARSET_FROM_ID (charset_ascii);
5051                 }
5052               else
5053                 {
5054                   c = coding->default_char;
5055                   charset = char_charset (c, charset_list, &code);
5056                 }
5057             }
5058           if (code == CHARSET_INVALID_CODE (charset))
5059             abort ();
5060           if (charset == charset_big5)
5061             {
5062               int c1, c2;
5063
5064               c1 = code >> 8, c2 = code & 0xFF;
5065               EMIT_TWO_BYTES (c1, c2);
5066             }
5067           else
5068             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5069         }
5070     }
5071   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5072   coding->produced_char += produced_chars;
5073   coding->produced = dst - coding->destination;
5074   return 0;
5075 }
5076
5077 \f
5078 /*** 10. CCL handlers ***/
5079
5080 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5081    Check if a text is encoded in a coding system of which
5082    encoder/decoder are written in CCL program.  If it is, return
5083    CATEGORY_MASK_CCL, else return 0.  */
5084
5085 static int
5086 detect_coding_ccl (struct coding_system *coding,
5087                    struct coding_detection_info *detect_info)
5088 {
5089   const unsigned char *src = coding->source, *src_base;
5090   const unsigned char *src_end = coding->source + coding->src_bytes;
5091   int multibytep = coding->src_multibyte;
5092   int consumed_chars = 0;
5093   int found = 0;
5094   unsigned char *valids;
5095   int head_ascii = coding->head_ascii;
5096   Lisp_Object attrs;
5097
5098   detect_info->checked |= CATEGORY_MASK_CCL;
5099
5100   coding = &coding_categories[coding_category_ccl];
5101   valids = CODING_CCL_VALIDS (coding);
5102   attrs = CODING_ID_ATTRS (coding->id);
5103   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5104     src += head_ascii;
5105
5106   while (1)
5107     {
5108       int c;
5109
5110       src_base = src;
5111       ONE_MORE_BYTE (c);
5112       if (c < 0 || ! valids[c])
5113         break;
5114       if ((valids[c] > 1))
5115         found = CATEGORY_MASK_CCL;
5116     }
5117   detect_info->rejected |= CATEGORY_MASK_CCL;
5118   return 0;
5119
5120  no_more_source:
5121   detect_info->found |= found;
5122   return 1;
5123 }
5124
5125 static void
5126 decode_coding_ccl (struct coding_system *coding)
5127 {
5128   const unsigned char *src = coding->source + coding->consumed;
5129   const unsigned char *src_end = coding->source + coding->src_bytes;
5130   int *charbuf = coding->charbuf + coding->charbuf_used;
5131   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5132   int consumed_chars = 0;
5133   int multibytep = coding->src_multibyte;
5134   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5135   int source_charbuf[1024];
5136   int source_byteidx[1025];
5137   Lisp_Object attrs, charset_list;
5138
5139   CODING_GET_INFO (coding, attrs, charset_list);
5140
5141   while (1)
5142     {
5143       const unsigned char *p = src;
5144       int i = 0;
5145
5146       if (multibytep)
5147         {
5148           while (i < 1024 && p < src_end)
5149             {
5150               source_byteidx[i] = p - src;
5151               source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5152             }
5153           source_byteidx[i] = p - src;
5154         }
5155       else
5156         while (i < 1024 && p < src_end)
5157           source_charbuf[i++] = *p++;
5158
5159       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5160         ccl->last_block = 1;
5161       ccl_driver (ccl, source_charbuf, charbuf, i, charbuf_end - charbuf,
5162                   charset_list);
5163       charbuf += ccl->produced;
5164       if (multibytep)
5165         src += source_byteidx[ccl->consumed];
5166       else
5167         src += ccl->consumed;
5168       consumed_chars += ccl->consumed;
5169       if (p == src_end || ccl->status != CCL_STAT_SUSPEND_BY_SRC)
5170         break;
5171     }
5172
5173   switch (ccl->status)
5174     {
5175     case CCL_STAT_SUSPEND_BY_SRC:
5176       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5177       break;
5178     case CCL_STAT_SUSPEND_BY_DST:
5179       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5180       break;
5181     case CCL_STAT_QUIT:
5182     case CCL_STAT_INVALID_CMD:
5183       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5184       break;
5185     default:
5186       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5187       break;
5188     }
5189   coding->consumed_char += consumed_chars;
5190   coding->consumed = src - coding->source;
5191   coding->charbuf_used = charbuf - coding->charbuf;
5192 }
5193
5194 static int
5195 encode_coding_ccl (struct coding_system *coding)
5196 {
5197   struct ccl_program *ccl = &coding->spec.ccl->ccl;
5198   int multibytep = coding->dst_multibyte;
5199   int *charbuf = coding->charbuf;
5200   int *charbuf_end = charbuf + coding->charbuf_used;
5201   unsigned char *dst = coding->destination + coding->produced;
5202   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5203   int destination_charbuf[1024];
5204   int i, produced_chars = 0;
5205   Lisp_Object attrs, charset_list;
5206
5207   CODING_GET_INFO (coding, attrs, charset_list);
5208   if (coding->consumed_char == coding->src_chars
5209       && coding->mode & CODING_MODE_LAST_BLOCK)
5210     ccl->last_block = 1;
5211
5212   while (charbuf < charbuf_end)
5213     {
5214       ccl_driver (ccl, charbuf, destination_charbuf,
5215                   charbuf_end - charbuf, 1024, charset_list);
5216       if (multibytep)
5217         {
5218           ASSURE_DESTINATION (ccl->produced * 2);
5219           for (i = 0; i < ccl->produced; i++)
5220             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5221         }
5222       else
5223         {
5224           ASSURE_DESTINATION (ccl->produced);
5225           for (i = 0; i < ccl->produced; i++)
5226             *dst++ = destination_charbuf[i] & 0xFF;
5227           produced_chars += ccl->produced;
5228         }
5229       charbuf += ccl->consumed;
5230       if (ccl->status == CCL_STAT_QUIT
5231           || ccl->status == CCL_STAT_INVALID_CMD)
5232         break;
5233     }
5234
5235   switch (ccl->status)
5236     {
5237     case CCL_STAT_SUSPEND_BY_SRC:
5238       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5239       break;
5240     case CCL_STAT_SUSPEND_BY_DST:
5241       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5242       break;
5243     case CCL_STAT_QUIT:
5244     case CCL_STAT_INVALID_CMD:
5245       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5246       break;
5247     default:
5248       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5249       break;
5250     }
5251
5252   coding->produced_char += produced_chars;
5253   coding->produced = dst - coding->destination;
5254   return 0;
5255 }
5256
5257
5258 \f
5259 /*** 10, 11. no-conversion handlers ***/
5260
5261 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5262
5263 static void
5264 decode_coding_raw_text (struct coding_system *coding)
5265 {
5266   int eol_crlf =
5267     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5268
5269   coding->chars_at_source = 1;
5270   coding->consumed_char = coding->src_chars;
5271   coding->consumed = coding->src_bytes;
5272   if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
5273     {
5274       coding->consumed_char--;
5275       coding->consumed--;
5276       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5277     }
5278   else
5279     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5280 }
5281
5282 static int
5283 encode_coding_raw_text (struct coding_system *coding)
5284 {
5285   int multibytep = coding->dst_multibyte;
5286   int *charbuf = coding->charbuf;
5287   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5288   unsigned char *dst = coding->destination + coding->produced;
5289   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5290   int produced_chars = 0;
5291   int c;
5292
5293   if (multibytep)
5294     {
5295       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5296
5297       if (coding->src_multibyte)
5298         while (charbuf < charbuf_end)
5299           {
5300             ASSURE_DESTINATION (safe_room);
5301             c = *charbuf++;
5302             if (ASCII_CHAR_P (c))
5303               EMIT_ONE_ASCII_BYTE (c);
5304             else if (CHAR_BYTE8_P (c))
5305               {
5306                 c = CHAR_TO_BYTE8 (c);
5307                 EMIT_ONE_BYTE (c);
5308               }
5309             else
5310               {
5311                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5312
5313                 CHAR_STRING_ADVANCE (c, p1);
5314                 while (p0 < p1)
5315                   {
5316                     EMIT_ONE_BYTE (*p0);
5317                     p0++;
5318                   }
5319               }
5320           }
5321       else
5322         while (charbuf < charbuf_end)
5323           {
5324             ASSURE_DESTINATION (safe_room);
5325             c = *charbuf++;
5326             EMIT_ONE_BYTE (c);
5327           }
5328     }
5329   else
5330     {
5331       if (coding->src_multibyte)
5332         {
5333           int safe_room = MAX_MULTIBYTE_LENGTH;
5334
5335           while (charbuf < charbuf_end)
5336             {
5337               ASSURE_DESTINATION (safe_room);
5338               c = *charbuf++;
5339               if (ASCII_CHAR_P (c))
5340                 *dst++ = c;
5341               else if (CHAR_BYTE8_P (c))
5342                 *dst++ = CHAR_TO_BYTE8 (c);
5343               else
5344                 CHAR_STRING_ADVANCE (c, dst);
5345             }
5346         }
5347       else
5348         {
5349           ASSURE_DESTINATION (charbuf_end - charbuf);
5350           while (charbuf < charbuf_end && dst < dst_end)
5351             *dst++ = *charbuf++;
5352         }
5353       produced_chars = dst - (coding->destination + coding->produced);
5354     }
5355   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5356   coding->produced_char += produced_chars;
5357   coding->produced = dst - coding->destination;
5358   return 0;
5359 }
5360
5361 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5362    Check if a text is encoded in a charset-based coding system.  If it
5363    is, return 1, else return 0.  */
5364
5365 static int
5366 detect_coding_charset (struct coding_system *coding,
5367                        struct coding_detection_info *detect_info)
5368 {
5369   const unsigned char *src = coding->source, *src_base;
5370   const unsigned char *src_end = coding->source + coding->src_bytes;
5371   int multibytep = coding->src_multibyte;
5372   int consumed_chars = 0;
5373   Lisp_Object attrs, valids, name;
5374   int found = 0;
5375   int head_ascii = coding->head_ascii;
5376   int check_latin_extra = 0;
5377
5378   detect_info->checked |= CATEGORY_MASK_CHARSET;
5379
5380   coding = &coding_categories[coding_category_charset];
5381   attrs = CODING_ID_ATTRS (coding->id);
5382   valids = AREF (attrs, coding_attr_charset_valids);
5383   name = CODING_ID_NAME (coding->id);
5384   if (strncmp (SSDATA (SYMBOL_NAME (name)),
5385                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5386       || strncmp (SSDATA (SYMBOL_NAME (name)),
5387                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5388     check_latin_extra = 1;
5389
5390   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5391     src += head_ascii;
5392
5393   while (1)
5394     {
5395       int c;
5396       Lisp_Object val;
5397       struct charset *charset;
5398       int dim, idx;
5399
5400       src_base = src;
5401       ONE_MORE_BYTE (c);
5402       if (c < 0)
5403         continue;
5404       val = AREF (valids, c);
5405       if (NILP (val))
5406         break;
5407       if (c >= 0x80)
5408         {
5409           if (c < 0xA0
5410               && check_latin_extra
5411               && (!VECTORP (Vlatin_extra_code_table)
5412                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5413             break;
5414           found = CATEGORY_MASK_CHARSET;
5415         }
5416       if (INTEGERP (val))
5417         {
5418           charset = CHARSET_FROM_ID (XFASTINT (val));
5419           dim = CHARSET_DIMENSION (charset);
5420           for (idx = 1; idx < dim; idx++)
5421             {
5422               if (src == src_end)
5423                 goto too_short;
5424               ONE_MORE_BYTE (c);
5425               if (c < charset->code_space[(dim - 1 - idx) * 2]
5426                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5427                 break;
5428             }
5429           if (idx < dim)
5430             break;
5431         }
5432       else
5433         {
5434           idx = 1;
5435           for (; CONSP (val); val = XCDR (val))
5436             {
5437               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5438               dim = CHARSET_DIMENSION (charset);
5439               while (idx < dim)
5440                 {
5441                   if (src == src_end)
5442                     goto too_short;
5443                   ONE_MORE_BYTE (c);
5444                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5445                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5446                     break;
5447                   idx++;
5448                 }
5449               if (idx == dim)
5450                 {
5451                   val = Qnil;
5452                   break;
5453                 }
5454             }
5455           if (CONSP (val))
5456             break;
5457         }
5458     }
5459  too_short:
5460   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5461   return 0;
5462
5463  no_more_source:
5464   detect_info->found |= found;
5465   return 1;
5466 }
5467
5468 static void
5469 decode_coding_charset (struct coding_system *coding)
5470 {
5471   const unsigned char *src = coding->source + coding->consumed;
5472   const unsigned char *src_end = coding->source + coding->src_bytes;
5473   const unsigned char *src_base;
5474   int *charbuf = coding->charbuf + coding->charbuf_used;
5475   /* We may produce one charset annotation in one loop and one more at
5476      the end.  */
5477   int *charbuf_end
5478     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5479   int consumed_chars = 0, consumed_chars_base;
5480   int multibytep = coding->src_multibyte;
5481   Lisp_Object attrs, charset_list, valids;
5482   int char_offset = coding->produced_char;
5483   int last_offset = char_offset;
5484   int last_id = charset_ascii;
5485   int eol_crlf =
5486     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5487   int byte_after_cr = -1;
5488
5489   CODING_GET_INFO (coding, attrs, charset_list);
5490   valids = AREF (attrs, coding_attr_charset_valids);
5491
5492   while (1)
5493     {
5494       int c;
5495       Lisp_Object val;
5496       struct charset *charset;
5497       int dim;
5498       int len = 1;
5499       unsigned code;
5500
5501       src_base = src;
5502       consumed_chars_base = consumed_chars;
5503
5504       if (charbuf >= charbuf_end)
5505         {
5506           if (byte_after_cr >= 0)
5507             src_base--;
5508           break;
5509         }
5510
5511       if (byte_after_cr >= 0)
5512         {
5513           c = byte_after_cr;
5514           byte_after_cr = -1;
5515         }
5516       else
5517         {
5518           ONE_MORE_BYTE (c);
5519           if (eol_crlf && c == '\r')
5520             ONE_MORE_BYTE (byte_after_cr);
5521         }
5522       if (c < 0)
5523         goto invalid_code;
5524       code = c;
5525
5526       val = AREF (valids, c);
5527       if (! INTEGERP (val) && ! CONSP (val))
5528         goto invalid_code;
5529       if (INTEGERP (val))
5530         {
5531           charset = CHARSET_FROM_ID (XFASTINT (val));
5532           dim = CHARSET_DIMENSION (charset);
5533           while (len < dim)
5534             {
5535               ONE_MORE_BYTE (c);
5536               code = (code << 8) | c;
5537               len++;
5538             }
5539           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5540                               charset, code, c);
5541         }
5542       else
5543         {
5544           /* VAL is a list of charset IDs.  It is assured that the
5545              list is sorted by charset dimensions (smaller one
5546              comes first).  */
5547           while (CONSP (val))
5548             {
5549               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5550               dim = CHARSET_DIMENSION (charset);
5551               while (len < dim)
5552                 {
5553                   ONE_MORE_BYTE (c);
5554                   code = (code << 8) | c;
5555                   len++;
5556                 }
5557               CODING_DECODE_CHAR (coding, src, src_base,
5558                                   src_end, charset, code, c);
5559               if (c >= 0)
5560                 break;
5561               val = XCDR (val);
5562             }
5563         }
5564       if (c < 0)
5565         goto invalid_code;
5566       if (charset->id != charset_ascii
5567           && last_id != charset->id)
5568         {
5569           if (last_id != charset_ascii)
5570             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5571           last_id = charset->id;
5572           last_offset = char_offset;
5573         }
5574
5575       *charbuf++ = c;
5576       char_offset++;
5577       continue;
5578
5579     invalid_code:
5580       src = src_base;
5581       consumed_chars = consumed_chars_base;
5582       ONE_MORE_BYTE (c);
5583       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5584       char_offset++;
5585       coding->errors++;
5586     }
5587
5588  no_more_source:
5589   if (last_id != charset_ascii)
5590     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5591   coding->consumed_char += consumed_chars_base;
5592   coding->consumed = src_base - coding->source;
5593   coding->charbuf_used = charbuf - coding->charbuf;
5594 }
5595
5596 static int
5597 encode_coding_charset (struct coding_system *coding)
5598 {
5599   int multibytep = coding->dst_multibyte;
5600   int *charbuf = coding->charbuf;
5601   int *charbuf_end = charbuf + coding->charbuf_used;
5602   unsigned char *dst = coding->destination + coding->produced;
5603   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5604   int safe_room = MAX_MULTIBYTE_LENGTH;
5605   int produced_chars = 0;
5606   Lisp_Object attrs, charset_list;
5607   int ascii_compatible;
5608   int c;
5609
5610   CODING_GET_INFO (coding, attrs, charset_list);
5611   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5612
5613   while (charbuf < charbuf_end)
5614     {
5615       struct charset *charset;
5616       unsigned code;
5617
5618       ASSURE_DESTINATION (safe_room);
5619       c = *charbuf++;
5620       if (ascii_compatible && ASCII_CHAR_P (c))
5621         EMIT_ONE_ASCII_BYTE (c);
5622       else if (CHAR_BYTE8_P (c))
5623         {
5624           c = CHAR_TO_BYTE8 (c);
5625           EMIT_ONE_BYTE (c);
5626         }
5627       else
5628         {
5629           charset = char_charset (c, charset_list, &code);
5630           if (charset)
5631             {
5632               if (CHARSET_DIMENSION (charset) == 1)
5633                 EMIT_ONE_BYTE (code);
5634               else if (CHARSET_DIMENSION (charset) == 2)
5635                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5636               else if (CHARSET_DIMENSION (charset) == 3)
5637                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5638               else
5639                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5640                                  (code >> 8) & 0xFF, code & 0xFF);
5641             }
5642           else
5643             {
5644               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5645                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5646               else
5647                 c = coding->default_char;
5648               EMIT_ONE_BYTE (c);
5649             }
5650         }
5651     }
5652
5653   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5654   coding->produced_char += produced_chars;
5655   coding->produced = dst - coding->destination;
5656   return 0;
5657 }
5658
5659 \f
5660 /*** 7. C library functions ***/
5661
5662 /* Setup coding context CODING from information about CODING_SYSTEM.
5663    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5664    CODING_SYSTEM is invalid, signal an error.  */
5665
5666 void
5667 setup_coding_system (Lisp_Object coding_system, struct coding_system *coding)
5668 {
5669   Lisp_Object attrs;
5670   Lisp_Object eol_type;
5671   Lisp_Object coding_type;
5672   Lisp_Object val;
5673
5674   if (NILP (coding_system))
5675     coding_system = Qundecided;
5676
5677   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5678
5679   attrs = CODING_ID_ATTRS (coding->id);
5680   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5681
5682   coding->mode = 0;
5683   coding->head_ascii = -1;
5684   if (VECTORP (eol_type))
5685     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5686                             | CODING_REQUIRE_DETECTION_MASK);
5687   else if (! EQ (eol_type, Qunix))
5688     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5689                             | CODING_REQUIRE_ENCODING_MASK);
5690   else
5691     coding->common_flags = 0;
5692   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5693     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5694   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5695     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5696   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5697     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5698
5699   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5700   coding->max_charset_id = SCHARS (val) - 1;
5701   coding->safe_charsets = SDATA (val);
5702   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5703   coding->carryover_bytes = 0;
5704
5705   coding_type = CODING_ATTR_TYPE (attrs);
5706   if (EQ (coding_type, Qundecided))
5707     {
5708       coding->detector = NULL;
5709       coding->decoder = decode_coding_raw_text;
5710       coding->encoder = encode_coding_raw_text;
5711       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5712     }
5713   else if (EQ (coding_type, Qiso_2022))
5714     {
5715       int i;
5716       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5717
5718       /* Invoke graphic register 0 to plane 0.  */
5719       CODING_ISO_INVOCATION (coding, 0) = 0;
5720       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5721       CODING_ISO_INVOCATION (coding, 1)
5722         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5723       /* Setup the initial status of designation.  */
5724       for (i = 0; i < 4; i++)
5725         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5726       /* Not single shifting initially.  */
5727       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5728       /* Beginning of buffer should also be regarded as bol. */
5729       CODING_ISO_BOL (coding) = 1;
5730       coding->detector = detect_coding_iso_2022;
5731       coding->decoder = decode_coding_iso_2022;
5732       coding->encoder = encode_coding_iso_2022;
5733       if (flags & CODING_ISO_FLAG_SAFE)
5734         coding->mode |= CODING_MODE_SAFE_ENCODING;
5735       coding->common_flags
5736         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5737             | CODING_REQUIRE_FLUSHING_MASK);
5738       if (flags & CODING_ISO_FLAG_COMPOSITION)
5739         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5740       if (flags & CODING_ISO_FLAG_DESIGNATION)
5741         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5742       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5743         {
5744           setup_iso_safe_charsets (attrs);
5745           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5746           coding->max_charset_id = SCHARS (val) - 1;
5747           coding->safe_charsets = SDATA (val);
5748         }
5749       CODING_ISO_FLAGS (coding) = flags;
5750       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5751       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5752       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5753       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5754     }
5755   else if (EQ (coding_type, Qcharset))
5756     {
5757       coding->detector = detect_coding_charset;
5758       coding->decoder = decode_coding_charset;
5759       coding->encoder = encode_coding_charset;
5760       coding->common_flags
5761         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5762     }
5763   else if (EQ (coding_type, Qutf_8))
5764     {
5765       val = AREF (attrs, coding_attr_utf_bom);
5766       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5767                                    : EQ (val, Qt) ? utf_with_bom
5768                                    : utf_without_bom);
5769       coding->detector = detect_coding_utf_8;
5770       coding->decoder = decode_coding_utf_8;
5771       coding->encoder = encode_coding_utf_8;
5772       coding->common_flags
5773         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5774       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5775         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5776     }
5777   else if (EQ (coding_type, Qutf_16))
5778     {
5779       val = AREF (attrs, coding_attr_utf_bom);
5780       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5781                                     : EQ (val, Qt) ? utf_with_bom
5782                                     : utf_without_bom);
5783       val = AREF (attrs, coding_attr_utf_16_endian);
5784       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5785                                        : utf_16_little_endian);
5786       CODING_UTF_16_SURROGATE (coding) = 0;
5787       coding->detector = detect_coding_utf_16;
5788       coding->decoder = decode_coding_utf_16;
5789       coding->encoder = encode_coding_utf_16;
5790       coding->common_flags
5791         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5792       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5793         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5794     }
5795   else if (EQ (coding_type, Qccl))
5796     {
5797       coding->detector = detect_coding_ccl;
5798       coding->decoder = decode_coding_ccl;
5799       coding->encoder = encode_coding_ccl;
5800       coding->common_flags
5801         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5802             | CODING_REQUIRE_FLUSHING_MASK);
5803     }
5804   else if (EQ (coding_type, Qemacs_mule))
5805     {
5806       coding->detector = detect_coding_emacs_mule;
5807       coding->decoder = decode_coding_emacs_mule;
5808       coding->encoder = encode_coding_emacs_mule;
5809       coding->common_flags
5810         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5811       coding->spec.emacs_mule.full_support = 1;
5812       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5813           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5814         {
5815           Lisp_Object tail, safe_charsets;
5816           int max_charset_id = 0;
5817
5818           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5819                tail = XCDR (tail))
5820             if (max_charset_id < XFASTINT (XCAR (tail)))
5821               max_charset_id = XFASTINT (XCAR (tail));
5822           safe_charsets = make_uninit_string (max_charset_id + 1);
5823           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5824           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5825                tail = XCDR (tail))
5826             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5827           coding->max_charset_id = max_charset_id;
5828           coding->safe_charsets = SDATA (safe_charsets);
5829           coding->spec.emacs_mule.full_support = 1;
5830         }
5831       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5832       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5833     }
5834   else if (EQ (coding_type, Qshift_jis))
5835     {
5836       coding->detector = detect_coding_sjis;
5837       coding->decoder = decode_coding_sjis;
5838       coding->encoder = encode_coding_sjis;
5839       coding->common_flags
5840         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5841     }
5842   else if (EQ (coding_type, Qbig5))
5843     {
5844       coding->detector = detect_coding_big5;
5845       coding->decoder = decode_coding_big5;
5846       coding->encoder = encode_coding_big5;
5847       coding->common_flags
5848         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5849     }
5850   else                          /* EQ (coding_type, Qraw_text) */
5851     {
5852       coding->detector = NULL;
5853       coding->decoder = decode_coding_raw_text;
5854       coding->encoder = encode_coding_raw_text;
5855       if (! EQ (eol_type, Qunix))
5856         {
5857           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5858           if (! VECTORP (eol_type))
5859             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5860         }
5861
5862     }
5863
5864   return;
5865 }
5866
5867 /* Return a list of charsets supported by CODING.  */
5868
5869 Lisp_Object
5870 coding_charset_list (struct coding_system *coding)
5871 {
5872   Lisp_Object attrs, charset_list;
5873
5874   CODING_GET_INFO (coding, attrs, charset_list);
5875   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5876     {
5877       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5878
5879       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5880         charset_list = Viso_2022_charset_list;
5881     }
5882   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5883     {
5884       charset_list = Vemacs_mule_charset_list;
5885     }
5886   return charset_list;
5887 }
5888
5889
5890 /* Return a list of charsets supported by CODING-SYSTEM.  */
5891
5892 Lisp_Object
5893 coding_system_charset_list (Lisp_Object coding_system)
5894 {
5895   int id;
5896   Lisp_Object attrs, charset_list;
5897
5898   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
5899   attrs = CODING_ID_ATTRS (id);
5900
5901   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5902     {
5903       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5904
5905       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5906         charset_list = Viso_2022_charset_list;
5907       else
5908         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5909     }
5910   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5911     {
5912       charset_list = Vemacs_mule_charset_list;
5913     }
5914   else
5915     {
5916       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
5917     }
5918   return charset_list;
5919 }
5920
5921
5922 /* Return raw-text or one of its subsidiaries that has the same
5923    eol_type as CODING-SYSTEM.  */
5924
5925 Lisp_Object
5926 raw_text_coding_system (Lisp_Object coding_system)
5927 {
5928   Lisp_Object spec, attrs;
5929   Lisp_Object eol_type, raw_text_eol_type;
5930
5931   if (NILP (coding_system))
5932     return Qraw_text;
5933   spec = CODING_SYSTEM_SPEC (coding_system);
5934   attrs = AREF (spec, 0);
5935
5936   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5937     return coding_system;
5938
5939   eol_type = AREF (spec, 2);
5940   if (VECTORP (eol_type))
5941     return Qraw_text;
5942   spec = CODING_SYSTEM_SPEC (Qraw_text);
5943   raw_text_eol_type = AREF (spec, 2);
5944   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5945           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5946           : AREF (raw_text_eol_type, 2));
5947 }
5948
5949
5950 /* If CODING_SYSTEM doesn't specify end-of-line format, return one of
5951    the subsidiary that has the same eol-spec as PARENT (if it is not
5952    nil and specifies end-of-line format) or the system's setting
5953    (system_eol_type).  */
5954
5955 Lisp_Object
5956 coding_inherit_eol_type (Lisp_Object coding_system, Lisp_Object parent)
5957 {
5958   Lisp_Object spec, eol_type;
5959
5960   if (NILP (coding_system))
5961     coding_system = Qraw_text;
5962   spec = CODING_SYSTEM_SPEC (coding_system);
5963   eol_type = AREF (spec, 2);
5964   if (VECTORP (eol_type))
5965     {
5966       Lisp_Object parent_eol_type;
5967
5968       if (! NILP (parent))
5969         {
5970           Lisp_Object parent_spec;
5971
5972           parent_spec = CODING_SYSTEM_SPEC (parent);
5973           parent_eol_type = AREF (parent_spec, 2);
5974           if (VECTORP (parent_eol_type))
5975             parent_eol_type = system_eol_type;
5976         }
5977       else
5978         parent_eol_type = system_eol_type;
5979       if (EQ (parent_eol_type, Qunix))
5980         coding_system = AREF (eol_type, 0);
5981       else if (EQ (parent_eol_type, Qdos))
5982         coding_system = AREF (eol_type, 1);
5983       else if (EQ (parent_eol_type, Qmac))
5984         coding_system = AREF (eol_type, 2);
5985     }
5986   return coding_system;
5987 }
5988
5989
5990 /* Check if text-conversion and eol-conversion of CODING_SYSTEM are
5991    decided for writing to a process.  If not, complement them, and
5992    return a new coding system.  */
5993
5994 Lisp_Object
5995 complement_process_encoding_system (Lisp_Object coding_system)
5996 {
5997   Lisp_Object coding_base = Qnil, eol_base = Qnil;
5998   Lisp_Object spec, attrs;
5999   int i;
6000
6001   for (i = 0; i < 3; i++)
6002     {
6003       if (i == 1)
6004         coding_system = CDR_SAFE (Vdefault_process_coding_system);
6005       else if (i == 2)
6006         coding_system = preferred_coding_system ();
6007       spec = CODING_SYSTEM_SPEC (coding_system);
6008       if (NILP (spec))
6009         continue;
6010       attrs = AREF (spec, 0);
6011       if (NILP (coding_base) && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6012         coding_base = CODING_ATTR_BASE_NAME (attrs);
6013       if (NILP (eol_base) && ! VECTORP (AREF (spec, 2)))
6014         eol_base = coding_system;
6015       if (! NILP (coding_base) && ! NILP (eol_base))
6016         break;
6017     }
6018
6019   if (i > 0)
6020     /* The original CODING_SYSTEM didn't specify text-conversion or
6021        eol-conversion.  Be sure that we return a fully complemented
6022        coding system.  */
6023     coding_system = coding_inherit_eol_type (coding_base, eol_base);
6024   return coding_system;
6025 }
6026
6027
6028 /* Emacs has a mechanism to automatically detect a coding system if it
6029    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6030    it's impossible to distinguish some coding systems accurately
6031    because they use the same range of codes.  So, at first, coding
6032    systems are categorized into 7, those are:
6033
6034    o coding-category-emacs-mule
6035
6036         The category for a coding system which has the same code range
6037         as Emacs' internal format.  Assigned the coding-system (Lisp
6038         symbol) `emacs-mule' by default.
6039
6040    o coding-category-sjis
6041
6042         The category for a coding system which has the same code range
6043         as SJIS.  Assigned the coding-system (Lisp
6044         symbol) `japanese-shift-jis' by default.
6045
6046    o coding-category-iso-7
6047
6048         The category for a coding system which has the same code range
6049         as ISO2022 of 7-bit environment.  This doesn't use any locking
6050         shift and single shift functions.  This can encode/decode all
6051         charsets.  Assigned the coding-system (Lisp symbol)
6052         `iso-2022-7bit' by default.
6053
6054    o coding-category-iso-7-tight
6055
6056         Same as coding-category-iso-7 except that this can
6057         encode/decode only the specified charsets.
6058
6059    o coding-category-iso-8-1
6060
6061         The category for a coding system which has the same code range
6062         as ISO2022 of 8-bit environment and graphic plane 1 used only
6063         for DIMENSION1 charset.  This doesn't use any locking shift
6064         and single shift functions.  Assigned the coding-system (Lisp
6065         symbol) `iso-latin-1' by default.
6066
6067    o coding-category-iso-8-2
6068
6069         The category for a coding system which has the same code range
6070         as ISO2022 of 8-bit environment and graphic plane 1 used only
6071         for DIMENSION2 charset.  This doesn't use any locking shift
6072         and single shift functions.  Assigned the coding-system (Lisp
6073         symbol) `japanese-iso-8bit' by default.
6074
6075    o coding-category-iso-7-else
6076
6077         The category for a coding system which has the same code range
6078         as ISO2022 of 7-bit environment but uses locking shift or
6079         single shift functions.  Assigned the coding-system (Lisp
6080         symbol) `iso-2022-7bit-lock' by default.
6081
6082    o coding-category-iso-8-else
6083
6084         The category for a coding system which has the same code range
6085         as ISO2022 of 8-bit environment but uses locking shift or
6086         single shift functions.  Assigned the coding-system (Lisp
6087         symbol) `iso-2022-8bit-ss2' by default.
6088
6089    o coding-category-big5
6090
6091         The category for a coding system which has the same code range
6092         as BIG5.  Assigned the coding-system (Lisp symbol)
6093         `cn-big5' by default.
6094
6095    o coding-category-utf-8
6096
6097         The category for a coding system which has the same code range
6098         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6099         symbol) `utf-8' by default.
6100
6101    o coding-category-utf-16-be
6102
6103         The category for a coding system in which a text has an
6104         Unicode signature (cf. Unicode Standard) in the order of BIG
6105         endian at the head.  Assigned the coding-system (Lisp symbol)
6106         `utf-16-be' by default.
6107
6108    o coding-category-utf-16-le
6109
6110         The category for a coding system in which a text has an
6111         Unicode signature (cf. Unicode Standard) in the order of
6112         LITTLE endian at the head.  Assigned the coding-system (Lisp
6113         symbol) `utf-16-le' by default.
6114
6115    o coding-category-ccl
6116
6117         The category for a coding system of which encoder/decoder is
6118         written in CCL programs.  The default value is nil, i.e., no
6119         coding system is assigned.
6120
6121    o coding-category-binary
6122
6123         The category for a coding system not categorized in any of the
6124         above.  Assigned the coding-system (Lisp symbol)
6125         `no-conversion' by default.
6126
6127    Each of them is a Lisp symbol and the value is an actual
6128    `coding-system's (this is also a Lisp symbol) assigned by a user.
6129    What Emacs does actually is to detect a category of coding system.
6130    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6131    decide only one possible category, it selects a category of the
6132    highest priority.  Priorities of categories are also specified by a
6133    user in a Lisp variable `coding-category-list'.
6134
6135 */
6136
6137 #define EOL_SEEN_NONE   0
6138 #define EOL_SEEN_LF     1
6139 #define EOL_SEEN_CR     2
6140 #define EOL_SEEN_CRLF   4
6141
6142 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6143    SOURCE is encoded.  If CATEGORY is one of
6144    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6145    two-byte, else they are encoded by one-byte.
6146
6147    Return one of EOL_SEEN_XXX.  */
6148
6149 #define MAX_EOL_CHECK_COUNT 3
6150
6151 static int
6152 detect_eol (const unsigned char *source, EMACS_INT src_bytes,
6153             enum coding_category category)
6154 {
6155   const unsigned char *src = source, *src_end = src + src_bytes;
6156   unsigned char c;
6157   int total  = 0;
6158   int eol_seen = EOL_SEEN_NONE;
6159
6160   if ((1 << category) & CATEGORY_MASK_UTF_16)
6161     {
6162       int msb, lsb;
6163
6164       msb = category == (coding_category_utf_16_le
6165                          | coding_category_utf_16_le_nosig);
6166       lsb = 1 - msb;
6167
6168       while (src + 1 < src_end)
6169         {
6170           c = src[lsb];
6171           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6172             {
6173               int this_eol;
6174
6175               if (c == '\n')
6176                 this_eol = EOL_SEEN_LF;
6177               else if (src + 3 >= src_end
6178                        || src[msb + 2] != 0
6179                        || src[lsb + 2] != '\n')
6180                 this_eol = EOL_SEEN_CR;
6181               else
6182                 {
6183                   this_eol = EOL_SEEN_CRLF;
6184                   src += 2;
6185                 }
6186
6187               if (eol_seen == EOL_SEEN_NONE)
6188                 /* This is the first end-of-line.  */
6189                 eol_seen = this_eol;
6190               else if (eol_seen != this_eol)
6191                 {
6192                   /* The found type is different from what found before.
6193                      Allow for stray ^M characters in DOS EOL files.  */
6194                   if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6195                       || (eol_seen == EOL_SEEN_CRLF
6196                           && this_eol == EOL_SEEN_CR))
6197                     eol_seen = EOL_SEEN_CRLF;
6198                   else
6199                     {
6200                       eol_seen = EOL_SEEN_LF;
6201                       break;
6202                     }
6203                 }
6204               if (++total == MAX_EOL_CHECK_COUNT)
6205                 break;
6206             }
6207           src += 2;
6208         }
6209     }
6210   else
6211     while (src < src_end)
6212       {
6213         c = *src++;
6214         if (c == '\n' || c == '\r')
6215           {
6216             int this_eol;
6217
6218             if (c == '\n')
6219               this_eol = EOL_SEEN_LF;
6220             else if (src >= src_end || *src != '\n')
6221               this_eol = EOL_SEEN_CR;
6222             else
6223               this_eol = EOL_SEEN_CRLF, src++;
6224
6225             if (eol_seen == EOL_SEEN_NONE)
6226               /* This is the first end-of-line.  */
6227               eol_seen = this_eol;
6228             else if (eol_seen != this_eol)
6229               {
6230                 /* The found type is different from what found before.
6231                    Allow for stray ^M characters in DOS EOL files.  */
6232                 if ((eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF)
6233                     || (eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR))
6234                   eol_seen = EOL_SEEN_CRLF;
6235                 else
6236                   {
6237                     eol_seen = EOL_SEEN_LF;
6238                     break;
6239                   }
6240               }
6241             if (++total == MAX_EOL_CHECK_COUNT)
6242               break;
6243           }
6244       }
6245   return eol_seen;
6246 }
6247
6248
6249 static Lisp_Object
6250 adjust_coding_eol_type (struct coding_system *coding, int eol_seen)
6251 {
6252   Lisp_Object eol_type;
6253
6254   eol_type = CODING_ID_EOL_TYPE (coding->id);
6255   if (eol_seen & EOL_SEEN_LF)
6256     {
6257       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6258       eol_type = Qunix;
6259     }
6260   else if (eol_seen & EOL_SEEN_CRLF)
6261     {
6262       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6263       eol_type = Qdos;
6264     }
6265   else if (eol_seen & EOL_SEEN_CR)
6266     {
6267       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6268       eol_type = Qmac;
6269     }
6270   return eol_type;
6271 }
6272
6273 /* Detect how a text specified in CODING is encoded.  If a coding
6274    system is detected, update fields of CODING by the detected coding
6275    system.  */
6276
6277 void
6278 detect_coding (struct coding_system *coding)
6279 {
6280   const unsigned char *src, *src_end;
6281   int saved_mode = coding->mode;
6282
6283   coding->consumed = coding->consumed_char = 0;
6284   coding->produced = coding->produced_char = 0;
6285   coding_set_source (coding);
6286
6287   src_end = coding->source + coding->src_bytes;
6288   coding->head_ascii = 0;
6289
6290   /* If we have not yet decided the text encoding type, detect it
6291      now.  */
6292   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6293     {
6294       int c, i;
6295       struct coding_detection_info detect_info;
6296       int null_byte_found = 0, eight_bit_found = 0;
6297
6298       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6299       for (src = coding->source; src < src_end; src++)
6300         {
6301           c = *src;
6302           if (c & 0x80)
6303             {
6304               eight_bit_found = 1;
6305               if (null_byte_found)
6306                 break;
6307             }
6308           else if (c < 0x20)
6309             {
6310               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6311                   && ! inhibit_iso_escape_detection
6312                   && ! detect_info.checked)
6313                 {
6314                   if (detect_coding_iso_2022 (coding, &detect_info))
6315                     {
6316                       /* We have scanned the whole data.  */
6317                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6318                         {
6319                           /* We didn't find an 8-bit code.  We may
6320                              have found a null-byte, but it's very
6321                              rare that a binary file conforms to
6322                              ISO-2022.  */
6323                           src = src_end;
6324                           coding->head_ascii = src - coding->source;
6325                         }
6326                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6327                       break;
6328                     }
6329                 }
6330               else if (! c && !inhibit_null_byte_detection)
6331                 {
6332                   null_byte_found = 1;
6333                   if (eight_bit_found)
6334                     break;
6335                 }
6336               if (! eight_bit_found)
6337                 coding->head_ascii++;
6338             }
6339           else if (! eight_bit_found)
6340             coding->head_ascii++;
6341         }
6342
6343       if (null_byte_found || eight_bit_found
6344           || coding->head_ascii < coding->src_bytes
6345           || detect_info.found)
6346         {
6347           enum coding_category category;
6348           struct coding_system *this;
6349
6350           if (coding->head_ascii == coding->src_bytes)
6351             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6352             for (i = 0; i < coding_category_raw_text; i++)
6353               {
6354                 category = coding_priorities[i];
6355                 this = coding_categories + category;
6356                 if (detect_info.found & (1 << category))
6357                   break;
6358               }
6359           else
6360             {
6361               if (null_byte_found)
6362                 {
6363                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6364                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6365                 }
6366               for (i = 0; i < coding_category_raw_text; i++)
6367                 {
6368                   category = coding_priorities[i];
6369                   this = coding_categories + category;
6370                   if (this->id < 0)
6371                     {
6372                       /* No coding system of this category is defined.  */
6373                       detect_info.rejected |= (1 << category);
6374                     }
6375                   else if (category >= coding_category_raw_text)
6376                     continue;
6377                   else if (detect_info.checked & (1 << category))
6378                     {
6379                       if (detect_info.found & (1 << category))
6380                         break;
6381                     }
6382                   else if ((*(this->detector)) (coding, &detect_info)
6383                            && detect_info.found & (1 << category))
6384                     {
6385                       if (category == coding_category_utf_16_auto)
6386                         {
6387                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6388                             category = coding_category_utf_16_le;
6389                           else
6390                             category = coding_category_utf_16_be;
6391                         }
6392                       break;
6393                     }
6394                 }
6395             }
6396
6397           if (i < coding_category_raw_text)
6398             setup_coding_system (CODING_ID_NAME (this->id), coding);
6399           else if (null_byte_found)
6400             setup_coding_system (Qno_conversion, coding);
6401           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6402                    == CATEGORY_MASK_ANY)
6403             setup_coding_system (Qraw_text, coding);
6404           else if (detect_info.rejected)
6405             for (i = 0; i < coding_category_raw_text; i++)
6406               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6407                 {
6408                   this = coding_categories + coding_priorities[i];
6409                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6410                   break;
6411                 }
6412         }
6413     }
6414   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6415            == coding_category_utf_8_auto)
6416     {
6417       Lisp_Object coding_systems;
6418       struct coding_detection_info detect_info;
6419
6420       coding_systems
6421         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6422       detect_info.found = detect_info.rejected = 0;
6423       coding->head_ascii = 0;
6424       if (CONSP (coding_systems)
6425           && detect_coding_utf_8 (coding, &detect_info))
6426         {
6427           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6428             setup_coding_system (XCAR (coding_systems), coding);
6429           else
6430             setup_coding_system (XCDR (coding_systems), coding);
6431         }
6432     }
6433   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6434            == coding_category_utf_16_auto)
6435     {
6436       Lisp_Object coding_systems;
6437       struct coding_detection_info detect_info;
6438
6439       coding_systems
6440         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6441       detect_info.found = detect_info.rejected = 0;
6442       coding->head_ascii = 0;
6443       if (CONSP (coding_systems)
6444           && detect_coding_utf_16 (coding, &detect_info))
6445         {
6446           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6447             setup_coding_system (XCAR (coding_systems), coding);
6448           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6449             setup_coding_system (XCDR (coding_systems), coding);
6450         }
6451     }
6452   coding->mode = saved_mode;
6453 }
6454
6455
6456 static void
6457 decode_eol (struct coding_system *coding)
6458 {
6459   Lisp_Object eol_type;
6460   unsigned char *p, *pbeg, *pend;
6461
6462   eol_type = CODING_ID_EOL_TYPE (coding->id);
6463   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6464     return;
6465
6466   if (NILP (coding->dst_object))
6467     pbeg = coding->destination;
6468   else
6469     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6470   pend = pbeg + coding->produced;
6471
6472   if (VECTORP (eol_type))
6473     {
6474       int eol_seen = EOL_SEEN_NONE;
6475
6476       for (p = pbeg; p < pend; p++)
6477         {
6478           if (*p == '\n')
6479             eol_seen |= EOL_SEEN_LF;
6480           else if (*p == '\r')
6481             {
6482               if (p + 1 < pend && *(p + 1) == '\n')
6483                 {
6484                   eol_seen |= EOL_SEEN_CRLF;
6485                   p++;
6486                 }
6487               else
6488                 eol_seen |= EOL_SEEN_CR;
6489             }
6490         }
6491       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6492       if ((eol_seen & EOL_SEEN_CRLF) != 0
6493           && (eol_seen & EOL_SEEN_CR) != 0
6494           && (eol_seen & EOL_SEEN_LF) == 0)
6495         eol_seen = EOL_SEEN_CRLF;
6496       else if (eol_seen != EOL_SEEN_NONE
6497           && eol_seen != EOL_SEEN_LF
6498           && eol_seen != EOL_SEEN_CRLF
6499           && eol_seen != EOL_SEEN_CR)
6500         eol_seen = EOL_SEEN_LF;
6501       if (eol_seen != EOL_SEEN_NONE)
6502         eol_type = adjust_coding_eol_type (coding, eol_seen);
6503     }
6504
6505   if (EQ (eol_type, Qmac))
6506     {
6507       for (p = pbeg; p < pend; p++)
6508         if (*p == '\r')
6509           *p = '\n';
6510     }
6511   else if (EQ (eol_type, Qdos))
6512     {
6513       int n = 0;
6514
6515       if (NILP (coding->dst_object))
6516         {
6517           /* Start deleting '\r' from the tail to minimize the memory
6518              movement.  */
6519           for (p = pend - 2; p >= pbeg; p--)
6520             if (*p == '\r')
6521               {
6522                 memmove (p, p + 1, pend-- - p - 1);
6523                 n++;
6524               }
6525         }
6526       else
6527         {
6528           int pos_byte = coding->dst_pos_byte;
6529           int pos = coding->dst_pos;
6530           int pos_end = pos + coding->produced_char - 1;
6531
6532           while (pos < pos_end)
6533             {
6534               p = BYTE_POS_ADDR (pos_byte);
6535               if (*p == '\r' && p[1] == '\n')
6536                 {
6537                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6538                   n++;
6539                   pos_end--;
6540                 }
6541               pos++;
6542               if (coding->dst_multibyte)
6543                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6544               else
6545                 pos_byte++;
6546             }
6547         }
6548       coding->produced -= n;
6549       coding->produced_char -= n;
6550     }
6551 }
6552
6553
6554 /* Return a translation table (or list of them) from coding system
6555    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6556    decoding (ENCODEP is zero). */
6557
6558 static Lisp_Object
6559 get_translation_table (Lisp_Object attrs, int encodep, int *max_lookup)
6560 {
6561   Lisp_Object standard, translation_table;
6562   Lisp_Object val;
6563
6564   if (NILP (Venable_character_translation))
6565     {
6566       if (max_lookup)
6567         *max_lookup = 0;
6568       return Qnil;
6569     }
6570   if (encodep)
6571     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6572       standard = Vstandard_translation_table_for_encode;
6573   else
6574     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6575       standard = Vstandard_translation_table_for_decode;
6576   if (NILP (translation_table))
6577     translation_table = standard;
6578   else
6579     {
6580       if (SYMBOLP (translation_table))
6581         translation_table = Fget (translation_table, Qtranslation_table);
6582       else if (CONSP (translation_table))
6583         {
6584           translation_table = Fcopy_sequence (translation_table);
6585           for (val = translation_table; CONSP (val); val = XCDR (val))
6586             if (SYMBOLP (XCAR (val)))
6587               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6588         }
6589       if (CHAR_TABLE_P (standard))
6590         {
6591           if (CONSP (translation_table))
6592             translation_table = nconc2 (translation_table,
6593                                         Fcons (standard, Qnil));
6594           else
6595             translation_table = Fcons (translation_table,
6596                                        Fcons (standard, Qnil));
6597         }
6598     }
6599
6600   if (max_lookup)
6601     {
6602       *max_lookup = 1;
6603       if (CHAR_TABLE_P (translation_table)
6604           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6605         {
6606           val = XCHAR_TABLE (translation_table)->extras[1];
6607           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6608             *max_lookup = XFASTINT (val);
6609         }
6610       else if (CONSP (translation_table))
6611         {
6612           Lisp_Object tail, val;
6613
6614           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6615             if (CHAR_TABLE_P (XCAR (tail))
6616                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6617               {
6618                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6619                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6620                   *max_lookup = XFASTINT (val);
6621               }
6622         }
6623     }
6624   return translation_table;
6625 }
6626
6627 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
6628   do {                                                          \
6629     trans = Qnil;                                               \
6630     if (CHAR_TABLE_P (table))                                   \
6631       {                                                         \
6632         trans = CHAR_TABLE_REF (table, c);                      \
6633         if (CHARACTERP (trans))                                 \
6634           c = XFASTINT (trans), trans = Qnil;                   \
6635       }                                                         \
6636     else if (CONSP (table))                                     \
6637       {                                                         \
6638         Lisp_Object tail;                                       \
6639                                                                 \
6640         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
6641           if (CHAR_TABLE_P (XCAR (tail)))                       \
6642             {                                                   \
6643               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
6644               if (CHARACTERP (trans))                           \
6645                 c = XFASTINT (trans), trans = Qnil;             \
6646               else if (! NILP (trans))                          \
6647                 break;                                          \
6648             }                                                   \
6649       }                                                         \
6650   } while (0)
6651
6652
6653 /* Return a translation of character(s) at BUF according to TRANS.
6654    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6655    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6656    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6657    translation is found, and Qnil if not found..
6658    If BUF is too short to lookup characters in FROM, return Qt.  */
6659
6660 static Lisp_Object
6661 get_translation (Lisp_Object trans, int *buf, int *buf_end)
6662 {
6663
6664   if (INTEGERP (trans))
6665     return trans;
6666   for (; CONSP (trans); trans = XCDR (trans))
6667     {
6668       Lisp_Object val = XCAR (trans);
6669       Lisp_Object from = XCAR (val);
6670       int len = ASIZE (from);
6671       int i;
6672
6673       for (i = 0; i < len; i++)
6674         {
6675           if (buf + i == buf_end)
6676             return Qt;
6677           if (XINT (AREF (from, i)) != buf[i])
6678             break;
6679         }
6680       if (i == len)
6681         return val;
6682     }
6683   return Qnil;
6684 }
6685
6686
6687 static int
6688 produce_chars (struct coding_system *coding, Lisp_Object translation_table,
6689                int last_block)
6690 {
6691   unsigned char *dst = coding->destination + coding->produced;
6692   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6693   EMACS_INT produced;
6694   EMACS_INT produced_chars = 0;
6695   int carryover = 0;
6696
6697   if (! coding->chars_at_source)
6698     {
6699       /* Source characters are in coding->charbuf.  */
6700       int *buf = coding->charbuf;
6701       int *buf_end = buf + coding->charbuf_used;
6702
6703       if (EQ (coding->src_object, coding->dst_object))
6704         {
6705           coding_set_source (coding);
6706           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6707         }
6708
6709       while (buf < buf_end)
6710         {
6711           int c = *buf, i;
6712
6713           if (c >= 0)
6714             {
6715               int from_nchars = 1, to_nchars = 1;
6716               Lisp_Object trans = Qnil;
6717
6718               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6719               if (! NILP (trans))
6720                 {
6721                   trans = get_translation (trans, buf, buf_end);
6722                   if (INTEGERP (trans))
6723                     c = XINT (trans);
6724                   else if (CONSP (trans))
6725                     {
6726                       from_nchars = ASIZE (XCAR (trans));
6727                       trans = XCDR (trans);
6728                       if (INTEGERP (trans))
6729                         c = XINT (trans);
6730                       else
6731                         {
6732                           to_nchars = ASIZE (trans);
6733                           c = XINT (AREF (trans, 0));
6734                         }
6735                     }
6736                   else if (EQ (trans, Qt) && ! last_block)
6737                     break;
6738                 }
6739
6740               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6741                 {
6742                   dst = alloc_destination (coding,
6743                                            buf_end - buf
6744                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6745                                            dst);
6746                   if (EQ (coding->src_object, coding->dst_object))
6747                     {
6748                       coding_set_source (coding);
6749                       dst_end = (((unsigned char *) coding->source)
6750                                  + coding->consumed);
6751                     }
6752                   else
6753                     dst_end = coding->destination + coding->dst_bytes;
6754                 }
6755
6756               for (i = 0; i < to_nchars; i++)
6757                 {
6758                   if (i > 0)
6759                     c = XINT (AREF (trans, i));
6760                   if (coding->dst_multibyte
6761                       || ! CHAR_BYTE8_P (c))
6762                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6763                   else
6764                     *dst++ = CHAR_TO_BYTE8 (c);
6765                 }
6766               produced_chars += to_nchars;
6767               buf += from_nchars;
6768             }
6769           else
6770             /* This is an annotation datum.  (-C) is the length.  */
6771             buf += -c;
6772         }
6773       carryover = buf_end - buf;
6774     }
6775   else
6776     {
6777       /* Source characters are at coding->source.  */
6778       const unsigned char *src = coding->source;
6779       const unsigned char *src_end = src + coding->consumed;
6780
6781       if (EQ (coding->dst_object, coding->src_object))
6782         dst_end = (unsigned char *) src;
6783       if (coding->src_multibyte != coding->dst_multibyte)
6784         {
6785           if (coding->src_multibyte)
6786             {
6787               int multibytep = 1;
6788               EMACS_INT consumed_chars = 0;
6789
6790               while (1)
6791                 {
6792                   const unsigned char *src_base = src;
6793                   int c;
6794
6795                   ONE_MORE_BYTE (c);
6796                   if (dst == dst_end)
6797                     {
6798                       if (EQ (coding->src_object, coding->dst_object))
6799                         dst_end = (unsigned char *) src;
6800                       if (dst == dst_end)
6801                         {
6802                           EMACS_INT offset = src - coding->source;
6803
6804                           dst = alloc_destination (coding, src_end - src + 1,
6805                                                    dst);
6806                           dst_end = coding->destination + coding->dst_bytes;
6807                           coding_set_source (coding);
6808                           src = coding->source + offset;
6809                           src_end = coding->source + coding->src_bytes;
6810                           if (EQ (coding->src_object, coding->dst_object))
6811                             dst_end = (unsigned char *) src;
6812                         }
6813                     }
6814                   *dst++ = c;
6815                   produced_chars++;
6816                 }
6817             no_more_source:
6818               ;
6819             }
6820           else
6821             while (src < src_end)
6822               {
6823                 int multibytep = 1;
6824                 int c = *src++;
6825
6826                 if (dst >= dst_end - 1)
6827                   {
6828                     if (EQ (coding->src_object, coding->dst_object))
6829                       dst_end = (unsigned char *) src;
6830                     if (dst >= dst_end - 1)
6831                       {
6832                         EMACS_INT offset = src - coding->source;
6833                         EMACS_INT more_bytes;
6834
6835                         if (EQ (coding->src_object, coding->dst_object))
6836                           more_bytes = ((src_end - src) / 2) + 2;
6837                         else
6838                           more_bytes = src_end - src + 2;
6839                         dst = alloc_destination (coding, more_bytes, dst);
6840                         dst_end = coding->destination + coding->dst_bytes;
6841                         coding_set_source (coding);
6842                         src = coding->source + offset;
6843                         src_end = coding->source + coding->src_bytes;
6844                         if (EQ (coding->src_object, coding->dst_object))
6845                           dst_end = (unsigned char *) src;
6846                       }
6847                   }
6848                 EMIT_ONE_BYTE (c);
6849               }
6850         }
6851       else
6852         {
6853           if (!EQ (coding->src_object, coding->dst_object))
6854             {
6855               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6856
6857               if (require > 0)
6858                 {
6859                   EMACS_INT offset = src - coding->source;
6860
6861                   dst = alloc_destination (coding, require, dst);
6862                   coding_set_source (coding);
6863                   src = coding->source + offset;
6864                   src_end = coding->source + coding->src_bytes;
6865                 }
6866             }
6867           produced_chars = coding->consumed_char;
6868           while (src < src_end)
6869             *dst++ = *src++;
6870         }
6871     }
6872
6873   produced = dst - (coding->destination + coding->produced);
6874   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6875     insert_from_gap (produced_chars, produced);
6876   coding->produced += produced;
6877   coding->produced_char += produced_chars;
6878   return carryover;
6879 }
6880
6881 /* Compose text in CODING->object according to the annotation data at
6882    CHARBUF.  CHARBUF is an array:
6883      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6884  */
6885
6886 static INLINE void
6887 produce_composition (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6888 {
6889   int len;
6890   EMACS_INT to;
6891   enum composition_method method;
6892   Lisp_Object components;
6893
6894   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6895   to = pos + charbuf[2];
6896   method = (enum composition_method) (charbuf[4]);
6897
6898   if (method == COMPOSITION_RELATIVE)
6899     components = Qnil;
6900   else
6901     {
6902       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6903       int i, j;
6904
6905       if (method == COMPOSITION_WITH_RULE)
6906         len = charbuf[2] * 3 - 2;
6907       charbuf += MAX_ANNOTATION_LENGTH;
6908       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6909       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6910         {
6911           if (charbuf[i] >= 0)
6912             args[j] = make_number (charbuf[i]);
6913           else
6914             {
6915               i++;
6916               args[j] = make_number (charbuf[i] % 0x100);
6917             }
6918         }
6919       components = (i == j ? Fstring (j, args) : Fvector (j, args));
6920     }
6921   compose_text (pos, to, components, Qnil, coding->dst_object);
6922 }
6923
6924
6925 /* Put `charset' property on text in CODING->object according to
6926    the annotation data at CHARBUF.  CHARBUF is an array:
6927      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6928  */
6929
6930 static INLINE void
6931 produce_charset (struct coding_system *coding, int *charbuf, EMACS_INT pos)
6932 {
6933   EMACS_INT from = pos - charbuf[2];
6934   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6935
6936   Fput_text_property (make_number (from), make_number (pos),
6937                       Qcharset, CHARSET_NAME (charset),
6938                       coding->dst_object);
6939 }
6940
6941
6942 #define CHARBUF_SIZE 0x4000
6943
6944 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
6945   do {                                                                  \
6946     int size = CHARBUF_SIZE;                                            \
6947                                                                         \
6948     coding->charbuf = NULL;                                             \
6949     while (size > 1024)                                                 \
6950       {                                                                 \
6951         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
6952         if (coding->charbuf)                                            \
6953           break;                                                        \
6954         size >>= 1;                                                     \
6955       }                                                                 \
6956     if (! coding->charbuf)                                              \
6957       {                                                                 \
6958         record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6959         return coding->result;                                          \
6960       }                                                                 \
6961     coding->charbuf_size = size;                                        \
6962   } while (0)
6963
6964
6965 static void
6966 produce_annotation (struct coding_system *coding, EMACS_INT pos)
6967 {
6968   int *charbuf = coding->charbuf;
6969   int *charbuf_end = charbuf + coding->charbuf_used;
6970
6971   if (NILP (coding->dst_object))
6972     return;
6973
6974   while (charbuf < charbuf_end)
6975     {
6976       if (*charbuf >= 0)
6977         pos++, charbuf++;
6978       else
6979         {
6980           int len = -*charbuf;
6981
6982           if (len > 2)
6983             switch (charbuf[1])
6984               {
6985               case CODING_ANNOTATE_COMPOSITION_MASK:
6986                 produce_composition (coding, charbuf, pos);
6987                 break;
6988               case CODING_ANNOTATE_CHARSET_MASK:
6989                 produce_charset (coding, charbuf, pos);
6990                 break;
6991               }
6992           charbuf += len;
6993         }
6994     }
6995 }
6996
6997 /* Decode the data at CODING->src_object into CODING->dst_object.
6998    CODING->src_object is a buffer, a string, or nil.
6999    CODING->dst_object is a buffer.
7000
7001    If CODING->src_object is a buffer, it must be the current buffer.
7002    In this case, if CODING->src_pos is positive, it is a position of
7003    the source text in the buffer, otherwise, the source text is in the
7004    gap area of the buffer, and CODING->src_pos specifies the offset of
7005    the text from GPT (which must be the same as PT).  If this is the
7006    same buffer as CODING->dst_object, CODING->src_pos must be
7007    negative.
7008
7009    If CODING->src_object is a string, CODING->src_pos is an index to
7010    that string.
7011
7012    If CODING->src_object is nil, CODING->source must already point to
7013    the non-relocatable memory area.  In this case, CODING->src_pos is
7014    an offset from CODING->source.
7015
7016    The decoded data is inserted at the current point of the buffer
7017    CODING->dst_object.
7018 */
7019
7020 static int
7021 decode_coding (struct coding_system *coding)
7022 {
7023   Lisp_Object attrs;
7024   Lisp_Object undo_list;
7025   Lisp_Object translation_table;
7026   struct ccl_spec cclspec;
7027   int carryover;
7028   int i;
7029
7030   if (BUFFERP (coding->src_object)
7031       && coding->src_pos > 0
7032       && coding->src_pos < GPT
7033       && coding->src_pos + coding->src_chars > GPT)
7034     move_gap_both (coding->src_pos, coding->src_pos_byte);
7035
7036   undo_list = Qt;
7037   if (BUFFERP (coding->dst_object))
7038     {
7039       if (current_buffer != XBUFFER (coding->dst_object))
7040         set_buffer_internal (XBUFFER (coding->dst_object));
7041       if (GPT != PT)
7042         move_gap_both (PT, PT_BYTE);
7043       undo_list = BVAR (current_buffer, undo_list);
7044       BVAR (current_buffer, undo_list) = Qt;
7045     }
7046
7047   coding->consumed = coding->consumed_char = 0;
7048   coding->produced = coding->produced_char = 0;
7049   coding->chars_at_source = 0;
7050   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7051   coding->errors = 0;
7052
7053   ALLOC_CONVERSION_WORK_AREA (coding);
7054
7055   attrs = CODING_ID_ATTRS (coding->id);
7056   translation_table = get_translation_table (attrs, 0, NULL);
7057
7058   carryover = 0;
7059   if (coding->decoder == decode_coding_ccl)
7060     {
7061       coding->spec.ccl = &cclspec;
7062       setup_ccl_program (&cclspec.ccl, CODING_CCL_DECODER (coding));
7063     }
7064   do
7065     {
7066       EMACS_INT pos = coding->dst_pos + coding->produced_char;
7067
7068       coding_set_source (coding);
7069       coding->annotated = 0;
7070       coding->charbuf_used = carryover;
7071       (*(coding->decoder)) (coding);
7072       coding_set_destination (coding);
7073       carryover = produce_chars (coding, translation_table, 0);
7074       if (coding->annotated)
7075         produce_annotation (coding, pos);
7076       for (i = 0; i < carryover; i++)
7077         coding->charbuf[i]
7078           = coding->charbuf[coding->charbuf_used - carryover + i];
7079     }
7080   while (coding->result == CODING_RESULT_INSUFFICIENT_DST
7081          || (coding->consumed < coding->src_bytes
7082              && (coding->result == CODING_RESULT_SUCCESS
7083                  || coding->result == CODING_RESULT_INVALID_SRC)));
7084
7085   if (carryover > 0)
7086     {
7087       coding_set_destination (coding);
7088       coding->charbuf_used = carryover;
7089       produce_chars (coding, translation_table, 1);
7090     }
7091
7092   coding->carryover_bytes = 0;
7093   if (coding->consumed < coding->src_bytes)
7094     {
7095       int nbytes = coding->src_bytes - coding->consumed;
7096       const unsigned char *src;
7097
7098       coding_set_source (coding);
7099       coding_set_destination (coding);
7100       src = coding->source + coding->consumed;
7101
7102       if (coding->mode & CODING_MODE_LAST_BLOCK)
7103         {
7104           /* Flush out unprocessed data as binary chars.  We are sure
7105              that the number of data is less than the size of
7106              coding->charbuf.  */
7107           coding->charbuf_used = 0;
7108           coding->chars_at_source = 0;
7109
7110           while (nbytes-- > 0)
7111             {
7112               int c = *src++;
7113
7114               if (c & 0x80)
7115                 c = BYTE8_TO_CHAR (c);
7116               coding->charbuf[coding->charbuf_used++] = c;
7117             }
7118           produce_chars (coding, Qnil, 1);
7119         }
7120       else
7121         {
7122           /* Record unprocessed bytes in coding->carryover.  We are
7123              sure that the number of data is less than the size of
7124              coding->carryover.  */
7125           unsigned char *p = coding->carryover;
7126
7127           if (nbytes > sizeof coding->carryover)
7128             nbytes = sizeof coding->carryover;
7129           coding->carryover_bytes = nbytes;
7130           while (nbytes-- > 0)
7131             *p++ = *src++;
7132         }
7133       coding->consumed = coding->src_bytes;
7134     }
7135
7136   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7137       && !inhibit_eol_conversion)
7138     decode_eol (coding);
7139   if (BUFFERP (coding->dst_object))
7140     {
7141       BVAR (current_buffer, undo_list) = undo_list;
7142       record_insert (coding->dst_pos, coding->produced_char);
7143     }
7144   return coding->result;
7145 }
7146
7147
7148 /* Extract an annotation datum from a composition starting at POS and
7149    ending before LIMIT of CODING->src_object (buffer or string), store
7150    the data in BUF, set *STOP to a starting position of the next
7151    composition (if any) or to LIMIT, and return the address of the
7152    next element of BUF.
7153
7154    If such an annotation is not found, set *STOP to a starting
7155    position of a composition after POS (if any) or to LIMIT, and
7156    return BUF.  */
7157
7158 static INLINE int *
7159 handle_composition_annotation (EMACS_INT pos, EMACS_INT limit,
7160                                struct coding_system *coding, int *buf,
7161                                EMACS_INT *stop)
7162 {
7163   EMACS_INT start, end;
7164   Lisp_Object prop;
7165
7166   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7167       || end > limit)
7168     *stop = limit;
7169   else if (start > pos)
7170     *stop = start;
7171   else
7172     {
7173       if (start == pos)
7174         {
7175           /* We found a composition.  Store the corresponding
7176              annotation data in BUF.  */
7177           int *head = buf;
7178           enum composition_method method = COMPOSITION_METHOD (prop);
7179           int nchars = COMPOSITION_LENGTH (prop);
7180
7181           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7182           if (method != COMPOSITION_RELATIVE)
7183             {
7184               Lisp_Object components;
7185               int len, i, i_byte;
7186
7187               components = COMPOSITION_COMPONENTS (prop);
7188               if (VECTORP (components))
7189                 {
7190                   len = XVECTOR (components)->size;
7191                   for (i = 0; i < len; i++)
7192                     *buf++ = XINT (AREF (components, i));
7193                 }
7194               else if (STRINGP (components))
7195                 {
7196                   len = SCHARS (components);
7197                   i = i_byte = 0;
7198                   while (i < len)
7199                     {
7200                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7201                       buf++;
7202                     }
7203                 }
7204               else if (INTEGERP (components))
7205                 {
7206                   len = 1;
7207                   *buf++ = XINT (components);
7208                 }
7209               else if (CONSP (components))
7210                 {
7211                   for (len = 0; CONSP (components);
7212                        len++, components = XCDR (components))
7213                     *buf++ = XINT (XCAR (components));
7214                 }
7215               else
7216                 abort ();
7217               *head -= len;
7218             }
7219         }
7220
7221       if (find_composition (end, limit, &start, &end, &prop,
7222                             coding->src_object)
7223           && end <= limit)
7224         *stop = start;
7225       else
7226         *stop = limit;
7227     }
7228   return buf;
7229 }
7230
7231
7232 /* Extract an annotation datum from a text property `charset' at POS of
7233    CODING->src_object (buffer of string), store the data in BUF, set
7234    *STOP to the position where the value of `charset' property changes
7235    (limiting by LIMIT), and return the address of the next element of
7236    BUF.
7237
7238    If the property value is nil, set *STOP to the position where the
7239    property value is non-nil (limiting by LIMIT), and return BUF.  */
7240
7241 static INLINE int *
7242 handle_charset_annotation (EMACS_INT pos, EMACS_INT limit,
7243                            struct coding_system *coding, int *buf,
7244                            EMACS_INT *stop)
7245 {
7246   Lisp_Object val, next;
7247   int id;
7248
7249   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7250   if (! NILP (val) && CHARSETP (val))
7251     id = XINT (CHARSET_SYMBOL_ID (val));
7252   else
7253     id = -1;
7254   ADD_CHARSET_DATA (buf, 0, id);
7255   next = Fnext_single_property_change (make_number (pos), Qcharset,
7256                                        coding->src_object,
7257                                        make_number (limit));
7258   *stop = XINT (next);
7259   return buf;
7260 }
7261
7262
7263 static void
7264 consume_chars (struct coding_system *coding, Lisp_Object translation_table,
7265                int max_lookup)
7266 {
7267   int *buf = coding->charbuf;
7268   int *buf_end = coding->charbuf + coding->charbuf_size;
7269   const unsigned char *src = coding->source + coding->consumed;
7270   const unsigned char *src_end = coding->source + coding->src_bytes;
7271   EMACS_INT pos = coding->src_pos + coding->consumed_char;
7272   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7273   int multibytep = coding->src_multibyte;
7274   Lisp_Object eol_type;
7275   int c;
7276   EMACS_INT stop, stop_composition, stop_charset;
7277   int *lookup_buf = NULL;
7278
7279   if (! NILP (translation_table))
7280     lookup_buf = alloca (sizeof (int) * max_lookup);
7281
7282   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7283   if (VECTORP (eol_type))
7284     eol_type = Qunix;
7285
7286   /* Note: composition handling is not yet implemented.  */
7287   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7288
7289   if (NILP (coding->src_object))
7290     stop = stop_composition = stop_charset = end_pos;
7291   else
7292     {
7293       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7294         stop = stop_composition = pos;
7295       else
7296         stop = stop_composition = end_pos;
7297       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7298         stop = stop_charset = pos;
7299       else
7300         stop_charset = end_pos;
7301     }
7302
7303   /* Compensate for CRLF and conversion.  */
7304   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7305   while (buf < buf_end)
7306     {
7307       Lisp_Object trans;
7308
7309       if (pos == stop)
7310         {
7311           if (pos == end_pos)
7312             break;
7313           if (pos == stop_composition)
7314             buf = handle_composition_annotation (pos, end_pos, coding,
7315                                                  buf, &stop_composition);
7316           if (pos == stop_charset)
7317             buf = handle_charset_annotation (pos, end_pos, coding,
7318                                              buf, &stop_charset);
7319           stop = (stop_composition < stop_charset
7320                   ? stop_composition : stop_charset);
7321         }
7322
7323       if (! multibytep)
7324         {
7325           EMACS_INT bytes;
7326
7327           if (coding->encoder == encode_coding_raw_text
7328               || coding->encoder == encode_coding_ccl)
7329             c = *src++, pos++;
7330           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7331             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7332           else
7333             c = BYTE8_TO_CHAR (*src), src++, pos++;
7334         }
7335       else
7336         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7337       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7338         c = '\n';
7339       if (! EQ (eol_type, Qunix))
7340         {
7341           if (c == '\n')
7342             {
7343               if (EQ (eol_type, Qdos))
7344                 *buf++ = '\r';
7345               else
7346                 c = '\r';
7347             }
7348         }
7349
7350       trans = Qnil;
7351       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7352       if (NILP (trans))
7353         *buf++ = c;
7354       else
7355         {
7356           int from_nchars = 1, to_nchars = 1;
7357           int *lookup_buf_end;
7358           const unsigned char *p = src;
7359           int i;
7360
7361           lookup_buf[0] = c;
7362           for (i = 1; i < max_lookup && p < src_end; i++)
7363             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7364           lookup_buf_end = lookup_buf + i;
7365           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7366           if (INTEGERP (trans))
7367             c = XINT (trans);
7368           else if (CONSP (trans))
7369             {
7370               from_nchars = ASIZE (XCAR (trans));
7371               trans = XCDR (trans);
7372               if (INTEGERP (trans))
7373                 c = XINT (trans);
7374               else
7375                 {
7376                   to_nchars = ASIZE (trans);
7377                   if (buf + to_nchars > buf_end)
7378                     break;
7379                   c = XINT (AREF (trans, 0));
7380                 }
7381             }
7382           else
7383             break;
7384           *buf++ = c;
7385           for (i = 1; i < to_nchars; i++)
7386             *buf++ = XINT (AREF (trans, i));
7387           for (i = 1; i < from_nchars; i++, pos++)
7388             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7389         }
7390     }
7391
7392   coding->consumed = src - coding->source;
7393   coding->consumed_char = pos - coding->src_pos;
7394   coding->charbuf_used = buf - coding->charbuf;
7395   coding->chars_at_source = 0;
7396 }
7397
7398
7399 /* Encode the text at CODING->src_object into CODING->dst_object.
7400    CODING->src_object is a buffer or a string.
7401    CODING->dst_object is a buffer or nil.
7402
7403    If CODING->src_object is a buffer, it must be the current buffer.
7404    In this case, if CODING->src_pos is positive, it is a position of
7405    the source text in the buffer, otherwise. the source text is in the
7406    gap area of the buffer, and coding->src_pos specifies the offset of
7407    the text from GPT (which must be the same as PT).  If this is the
7408    same buffer as CODING->dst_object, CODING->src_pos must be
7409    negative and CODING should not have `pre-write-conversion'.
7410
7411    If CODING->src_object is a string, CODING should not have
7412    `pre-write-conversion'.
7413
7414    If CODING->dst_object is a buffer, the encoded data is inserted at
7415    the current point of that buffer.
7416
7417    If CODING->dst_object is nil, the encoded data is placed at the
7418    memory area specified by CODING->destination.  */
7419
7420 static int
7421 encode_coding (struct coding_system *coding)
7422 {
7423   Lisp_Object attrs;
7424   Lisp_Object translation_table;
7425   int max_lookup;
7426   struct ccl_spec cclspec;
7427
7428   attrs = CODING_ID_ATTRS (coding->id);
7429   if (coding->encoder == encode_coding_raw_text)
7430     translation_table = Qnil, max_lookup = 0;
7431   else
7432     translation_table = get_translation_table (attrs, 1, &max_lookup);
7433
7434   if (BUFFERP (coding->dst_object))
7435     {
7436       set_buffer_internal (XBUFFER (coding->dst_object));
7437       coding->dst_multibyte
7438         = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7439     }
7440
7441   coding->consumed = coding->consumed_char = 0;
7442   coding->produced = coding->produced_char = 0;
7443   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7444   coding->errors = 0;
7445
7446   ALLOC_CONVERSION_WORK_AREA (coding);
7447
7448   if (coding->encoder == encode_coding_ccl)
7449     {
7450       coding->spec.ccl = &cclspec;
7451       setup_ccl_program (&cclspec.ccl, CODING_CCL_ENCODER (coding));
7452     }
7453   do {
7454     coding_set_source (coding);
7455     consume_chars (coding, translation_table, max_lookup);
7456     coding_set_destination (coding);
7457     (*(coding->encoder)) (coding);
7458   } while (coding->consumed_char < coding->src_chars);
7459
7460   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7461     insert_from_gap (coding->produced_char, coding->produced);
7462
7463   return (coding->result);
7464 }
7465
7466
7467 /* Name (or base name) of work buffer for code conversion.  */
7468 static Lisp_Object Vcode_conversion_workbuf_name;
7469
7470 /* A working buffer used by the top level conversion.  Once it is
7471    created, it is never destroyed.  It has the name
7472    Vcode_conversion_workbuf_name.  The other working buffers are
7473    destroyed after the use is finished, and their names are modified
7474    versions of Vcode_conversion_workbuf_name.  */
7475 static Lisp_Object Vcode_conversion_reused_workbuf;
7476
7477 /* 1 iff Vcode_conversion_reused_workbuf is already in use.  */
7478 static int reused_workbuf_in_use;
7479
7480
7481 /* Return a working buffer of code conversion.  MULTIBYTE specifies the
7482    multibyteness of returning buffer.  */
7483
7484 static Lisp_Object
7485 make_conversion_work_buffer (int multibyte)
7486 {
7487   Lisp_Object name, workbuf;
7488   struct buffer *current;
7489
7490   if (reused_workbuf_in_use++)
7491     {
7492       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7493       workbuf = Fget_buffer_create (name);
7494     }
7495   else
7496     {
7497       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7498         Vcode_conversion_reused_workbuf
7499           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7500       workbuf = Vcode_conversion_reused_workbuf;
7501     }
7502   current = current_buffer;
7503   set_buffer_internal (XBUFFER (workbuf));
7504   /* We can't allow modification hooks to run in the work buffer.  For
7505      instance, directory_files_internal assumes that file decoding
7506      doesn't compile new regexps.  */
7507   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7508   Ferase_buffer ();
7509   BVAR (current_buffer, undo_list) = Qt;
7510   BVAR (current_buffer, enable_multibyte_characters) = multibyte ? Qt : Qnil;
7511   set_buffer_internal (current);
7512   return workbuf;
7513 }
7514
7515
7516 static Lisp_Object
7517 code_conversion_restore (Lisp_Object arg)
7518 {
7519   Lisp_Object current, workbuf;
7520   struct gcpro gcpro1;
7521
7522   GCPRO1 (arg);
7523   current = XCAR (arg);
7524   workbuf = XCDR (arg);
7525   if (! NILP (workbuf))
7526     {
7527       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7528         reused_workbuf_in_use = 0;
7529       else if (! NILP (Fbuffer_live_p (workbuf)))
7530         Fkill_buffer (workbuf);
7531     }
7532   set_buffer_internal (XBUFFER (current));
7533   UNGCPRO;
7534   return Qnil;
7535 }
7536
7537 Lisp_Object
7538 code_conversion_save (int with_work_buf, int multibyte)
7539 {
7540   Lisp_Object workbuf = Qnil;
7541
7542   if (with_work_buf)
7543     workbuf = make_conversion_work_buffer (multibyte);
7544   record_unwind_protect (code_conversion_restore,
7545                          Fcons (Fcurrent_buffer (), workbuf));
7546   return workbuf;
7547 }
7548
7549 int
7550 decode_coding_gap (struct coding_system *coding,
7551                    EMACS_INT chars, EMACS_INT bytes)
7552 {
7553   int count = SPECPDL_INDEX ();
7554   Lisp_Object attrs;
7555
7556   code_conversion_save (0, 0);
7557
7558   coding->src_object = Fcurrent_buffer ();
7559   coding->src_chars = chars;
7560   coding->src_bytes = bytes;
7561   coding->src_pos = -chars;
7562   coding->src_pos_byte = -bytes;
7563   coding->src_multibyte = chars < bytes;
7564   coding->dst_object = coding->src_object;
7565   coding->dst_pos = PT;
7566   coding->dst_pos_byte = PT_BYTE;
7567   coding->dst_multibyte = ! NILP (BVAR (current_buffer, enable_multibyte_characters));
7568
7569   if (CODING_REQUIRE_DETECTION (coding))
7570     detect_coding (coding);
7571
7572   coding->mode |= CODING_MODE_LAST_BLOCK;
7573   current_buffer->text->inhibit_shrinking = 1;
7574   decode_coding (coding);
7575   current_buffer->text->inhibit_shrinking = 0;
7576
7577   attrs = CODING_ID_ATTRS (coding->id);
7578   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7579     {
7580       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7581       Lisp_Object val;
7582
7583       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7584       val = call1 (CODING_ATTR_POST_READ (attrs),
7585                    make_number (coding->produced_char));
7586       CHECK_NATNUM (val);
7587       coding->produced_char += Z - prev_Z;
7588       coding->produced += Z_BYTE - prev_Z_BYTE;
7589     }
7590
7591   unbind_to (count, Qnil);
7592   return coding->result;
7593 }
7594
7595 int
7596 encode_coding_gap (struct coding_system *coding,
7597                    EMACS_INT chars, EMACS_INT bytes)
7598 {
7599   int count = SPECPDL_INDEX ();
7600
7601   code_conversion_save (0, 0);
7602
7603   coding->src_object = Fcurrent_buffer ();
7604   coding->src_chars = chars;
7605   coding->src_bytes = bytes;
7606   coding->src_pos = -chars;
7607   coding->src_pos_byte = -bytes;
7608   coding->src_multibyte = chars < bytes;
7609   coding->dst_object = coding->src_object;
7610   coding->dst_pos = PT;
7611   coding->dst_pos_byte = PT_BYTE;
7612
7613   encode_coding (coding);
7614
7615   unbind_to (count, Qnil);
7616   return coding->result;
7617 }
7618
7619
7620 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7621    SRC_OBJECT into DST_OBJECT by coding context CODING.
7622
7623    SRC_OBJECT is a buffer, a string, or Qnil.
7624
7625    If it is a buffer, the text is at point of the buffer.  FROM and TO
7626    are positions in the buffer.
7627
7628    If it is a string, the text is at the beginning of the string.
7629    FROM and TO are indices to the string.
7630
7631    If it is nil, the text is at coding->source.  FROM and TO are
7632    indices to coding->source.
7633
7634    DST_OBJECT is a buffer, Qt, or Qnil.
7635
7636    If it is a buffer, the decoded text is inserted at point of the
7637    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7638    is deleted.
7639
7640    If it is Qt, a string is made from the decoded text, and
7641    set in CODING->dst_object.
7642
7643    If it is Qnil, the decoded text is stored at CODING->destination.
7644    The caller must allocate CODING->dst_bytes bytes at
7645    CODING->destination by xmalloc.  If the decoded text is longer than
7646    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7647  */
7648
7649 void
7650 decode_coding_object (struct coding_system *coding,
7651                       Lisp_Object src_object,
7652                       EMACS_INT from, EMACS_INT from_byte,
7653                       EMACS_INT to, EMACS_INT to_byte,
7654                       Lisp_Object dst_object)
7655 {
7656   int count = SPECPDL_INDEX ();
7657   unsigned char *destination;
7658   EMACS_INT dst_bytes;
7659   EMACS_INT chars = to - from;
7660   EMACS_INT bytes = to_byte - from_byte;
7661   Lisp_Object attrs;
7662   int saved_pt = -1, saved_pt_byte;
7663   int need_marker_adjustment = 0;
7664   Lisp_Object old_deactivate_mark;
7665
7666   old_deactivate_mark = Vdeactivate_mark;
7667
7668   if (NILP (dst_object))
7669     {
7670       destination = coding->destination;
7671       dst_bytes = coding->dst_bytes;
7672     }
7673
7674   coding->src_object = src_object;
7675   coding->src_chars = chars;
7676   coding->src_bytes = bytes;
7677   coding->src_multibyte = chars < bytes;
7678
7679   if (STRINGP (src_object))
7680     {
7681       coding->src_pos = from;
7682       coding->src_pos_byte = from_byte;
7683     }
7684   else if (BUFFERP (src_object))
7685     {
7686       set_buffer_internal (XBUFFER (src_object));
7687       if (from != GPT)
7688         move_gap_both (from, from_byte);
7689       if (EQ (src_object, dst_object))
7690         {
7691           struct Lisp_Marker *tail;
7692
7693           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7694             {
7695               tail->need_adjustment
7696                 = tail->charpos == (tail->insertion_type ? from : to);
7697               need_marker_adjustment |= tail->need_adjustment;
7698             }
7699           saved_pt = PT, saved_pt_byte = PT_BYTE;
7700           TEMP_SET_PT_BOTH (from, from_byte);
7701           current_buffer->text->inhibit_shrinking = 1;
7702           del_range_both (from, from_byte, to, to_byte, 1);
7703           coding->src_pos = -chars;
7704           coding->src_pos_byte = -bytes;
7705         }
7706       else
7707         {
7708           coding->src_pos = from;
7709           coding->src_pos_byte = from_byte;
7710         }
7711     }
7712
7713   if (CODING_REQUIRE_DETECTION (coding))
7714     detect_coding (coding);
7715   attrs = CODING_ID_ATTRS (coding->id);
7716
7717   if (EQ (dst_object, Qt)
7718       || (! NILP (CODING_ATTR_POST_READ (attrs))
7719           && NILP (dst_object)))
7720     {
7721       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7722       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7723       coding->dst_pos = BEG;
7724       coding->dst_pos_byte = BEG_BYTE;
7725     }
7726   else if (BUFFERP (dst_object))
7727     {
7728       code_conversion_save (0, 0);
7729       coding->dst_object = dst_object;
7730       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7731       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7732       coding->dst_multibyte
7733         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7734     }
7735   else
7736     {
7737       code_conversion_save (0, 0);
7738       coding->dst_object = Qnil;
7739       /* Most callers presume this will return a multibyte result, and they
7740          won't use `binary' or `raw-text' anyway, so let's not worry about
7741          CODING_FOR_UNIBYTE.  */
7742       coding->dst_multibyte = 1;
7743     }
7744
7745   decode_coding (coding);
7746
7747   if (BUFFERP (coding->dst_object))
7748     set_buffer_internal (XBUFFER (coding->dst_object));
7749
7750   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7751     {
7752       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7753       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7754       Lisp_Object val;
7755
7756       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7757       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7758               old_deactivate_mark);
7759       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7760                         make_number (coding->produced_char));
7761       UNGCPRO;
7762       CHECK_NATNUM (val);
7763       coding->produced_char += Z - prev_Z;
7764       coding->produced += Z_BYTE - prev_Z_BYTE;
7765     }
7766
7767   if (EQ (dst_object, Qt))
7768     {
7769       coding->dst_object = Fbuffer_string ();
7770     }
7771   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7772     {
7773       set_buffer_internal (XBUFFER (coding->dst_object));
7774       if (dst_bytes < coding->produced)
7775         {
7776           destination = xrealloc (destination, coding->produced);
7777           if (! destination)
7778             {
7779               record_conversion_result (coding,
7780                                         CODING_RESULT_INSUFFICIENT_MEM);
7781               unbind_to (count, Qnil);
7782               return;
7783             }
7784           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7785             move_gap_both (BEGV, BEGV_BYTE);
7786           memcpy (destination, BEGV_ADDR, coding->produced);
7787           coding->destination = destination;
7788         }
7789     }
7790
7791   if (saved_pt >= 0)
7792     {
7793       /* This is the case of:
7794          (BUFFERP (src_object) && EQ (src_object, dst_object))
7795          As we have moved PT while replacing the original buffer
7796          contents, we must recover it now.  */
7797       set_buffer_internal (XBUFFER (src_object));
7798       current_buffer->text->inhibit_shrinking = 0;
7799       if (saved_pt < from)
7800         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7801       else if (saved_pt < from + chars)
7802         TEMP_SET_PT_BOTH (from, from_byte);
7803       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
7804         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7805                           saved_pt_byte + (coding->produced - bytes));
7806       else
7807         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7808                           saved_pt_byte + (coding->produced - bytes));
7809
7810       if (need_marker_adjustment)
7811         {
7812           struct Lisp_Marker *tail;
7813
7814           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7815             if (tail->need_adjustment)
7816               {
7817                 tail->need_adjustment = 0;
7818                 if (tail->insertion_type)
7819                   {
7820                     tail->bytepos = from_byte;
7821                     tail->charpos = from;
7822                   }
7823                 else
7824                   {
7825                     tail->bytepos = from_byte + coding->produced;
7826                     tail->charpos
7827                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
7828                          ? tail->bytepos : from + coding->produced_char);
7829                   }
7830               }
7831         }
7832     }
7833
7834   Vdeactivate_mark = old_deactivate_mark;
7835   unbind_to (count, coding->dst_object);
7836 }
7837
7838
7839 void
7840 encode_coding_object (struct coding_system *coding,
7841                       Lisp_Object src_object,
7842                       EMACS_INT from, EMACS_INT from_byte,
7843                       EMACS_INT to, EMACS_INT to_byte,
7844                       Lisp_Object dst_object)
7845 {
7846   int count = SPECPDL_INDEX ();
7847   EMACS_INT chars = to - from;
7848   EMACS_INT bytes = to_byte - from_byte;
7849   Lisp_Object attrs;
7850   int saved_pt = -1, saved_pt_byte;
7851   int need_marker_adjustment = 0;
7852   int kill_src_buffer = 0;
7853   Lisp_Object old_deactivate_mark;
7854
7855   old_deactivate_mark = Vdeactivate_mark;
7856
7857   coding->src_object = src_object;
7858   coding->src_chars = chars;
7859   coding->src_bytes = bytes;
7860   coding->src_multibyte = chars < bytes;
7861
7862   attrs = CODING_ID_ATTRS (coding->id);
7863
7864   if (EQ (src_object, dst_object))
7865     {
7866       struct Lisp_Marker *tail;
7867
7868       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7869         {
7870           tail->need_adjustment
7871             = tail->charpos == (tail->insertion_type ? from : to);
7872           need_marker_adjustment |= tail->need_adjustment;
7873         }
7874     }
7875
7876   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7877     {
7878       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7879       set_buffer_internal (XBUFFER (coding->src_object));
7880       if (STRINGP (src_object))
7881         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7882       else if (BUFFERP (src_object))
7883         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7884       else
7885         insert_1_both ((char *) coding->source + from, chars, bytes, 0, 0, 0);
7886
7887       if (EQ (src_object, dst_object))
7888         {
7889           set_buffer_internal (XBUFFER (src_object));
7890           saved_pt = PT, saved_pt_byte = PT_BYTE;
7891           del_range_both (from, from_byte, to, to_byte, 1);
7892           set_buffer_internal (XBUFFER (coding->src_object));
7893         }
7894
7895       {
7896         Lisp_Object args[3];
7897         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7898
7899         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7900                 old_deactivate_mark);
7901         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7902         args[1] = make_number (BEG);
7903         args[2] = make_number (Z);
7904         safe_call (3, args);
7905         UNGCPRO;
7906       }
7907       if (XBUFFER (coding->src_object) != current_buffer)
7908         kill_src_buffer = 1;
7909       coding->src_object = Fcurrent_buffer ();
7910       if (BEG != GPT)
7911         move_gap_both (BEG, BEG_BYTE);
7912       coding->src_chars = Z - BEG;
7913       coding->src_bytes = Z_BYTE - BEG_BYTE;
7914       coding->src_pos = BEG;
7915       coding->src_pos_byte = BEG_BYTE;
7916       coding->src_multibyte = Z < Z_BYTE;
7917     }
7918   else if (STRINGP (src_object))
7919     {
7920       code_conversion_save (0, 0);
7921       coding->src_pos = from;
7922       coding->src_pos_byte = from_byte;
7923     }
7924   else if (BUFFERP (src_object))
7925     {
7926       code_conversion_save (0, 0);
7927       set_buffer_internal (XBUFFER (src_object));
7928       if (EQ (src_object, dst_object))
7929         {
7930           saved_pt = PT, saved_pt_byte = PT_BYTE;
7931           coding->src_object = del_range_1 (from, to, 1, 1);
7932           coding->src_pos = 0;
7933           coding->src_pos_byte = 0;
7934         }
7935       else
7936         {
7937           if (from < GPT && to >= GPT)
7938             move_gap_both (from, from_byte);
7939           coding->src_pos = from;
7940           coding->src_pos_byte = from_byte;
7941         }
7942     }
7943   else
7944     code_conversion_save (0, 0);
7945
7946   if (BUFFERP (dst_object))
7947     {
7948       coding->dst_object = dst_object;
7949       if (EQ (src_object, dst_object))
7950         {
7951           coding->dst_pos = from;
7952           coding->dst_pos_byte = from_byte;
7953         }
7954       else
7955         {
7956           struct buffer *current = current_buffer;
7957
7958           set_buffer_temp (XBUFFER (dst_object));
7959           coding->dst_pos = PT;
7960           coding->dst_pos_byte = PT_BYTE;
7961           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
7962           set_buffer_temp (current);
7963         }
7964       coding->dst_multibyte
7965         = ! NILP (BVAR (XBUFFER (dst_object), enable_multibyte_characters));
7966     }
7967   else if (EQ (dst_object, Qt))
7968     {
7969       coding->dst_object = Qnil;
7970       coding->dst_bytes = coding->src_chars;
7971       if (coding->dst_bytes == 0)
7972         coding->dst_bytes = 1;
7973       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7974       coding->dst_multibyte = 0;
7975     }
7976   else
7977     {
7978       coding->dst_object = Qnil;
7979       coding->dst_multibyte = 0;
7980     }
7981
7982   encode_coding (coding);
7983
7984   if (EQ (dst_object, Qt))
7985     {
7986       if (BUFFERP (coding->dst_object))
7987         coding->dst_object = Fbuffer_string ();
7988       else
7989         {
7990           coding->dst_object
7991             = make_unibyte_string ((char *) coding->destination,
7992                                    coding->produced);
7993           xfree (coding->destination);
7994         }
7995     }
7996
7997   if (saved_pt >= 0)
7998     {
7999       /* This is the case of:
8000          (BUFFERP (src_object) && EQ (src_object, dst_object))
8001          As we have moved PT while replacing the original buffer
8002          contents, we must recover it now.  */
8003       set_buffer_internal (XBUFFER (src_object));
8004       if (saved_pt < from)
8005         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8006       else if (saved_pt < from + chars)
8007         TEMP_SET_PT_BOTH (from, from_byte);
8008       else if (! NILP (BVAR (current_buffer, enable_multibyte_characters)))
8009         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8010                           saved_pt_byte + (coding->produced - bytes));
8011       else
8012         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8013                           saved_pt_byte + (coding->produced - bytes));
8014
8015       if (need_marker_adjustment)
8016         {
8017           struct Lisp_Marker *tail;
8018
8019           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8020             if (tail->need_adjustment)
8021               {
8022                 tail->need_adjustment = 0;
8023                 if (tail->insertion_type)
8024                   {
8025                     tail->bytepos = from_byte;
8026                     tail->charpos = from;
8027                   }
8028                 else
8029                   {
8030                     tail->bytepos = from_byte + coding->produced;
8031                     tail->charpos
8032                       = (NILP (BVAR (current_buffer, enable_multibyte_characters))
8033                          ? tail->bytepos : from + coding->produced_char);
8034                   }
8035               }
8036         }
8037     }
8038
8039   if (kill_src_buffer)
8040     Fkill_buffer (coding->src_object);
8041
8042   Vdeactivate_mark = old_deactivate_mark;
8043   unbind_to (count, Qnil);
8044 }
8045
8046
8047 Lisp_Object
8048 preferred_coding_system (void)
8049 {
8050   int id = coding_categories[coding_priorities[0]].id;
8051
8052   return CODING_ID_NAME (id);
8053 }
8054
8055 \f
8056 #ifdef emacs
8057 /*** 8. Emacs Lisp library functions ***/
8058
8059 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8060        doc: /* Return t if OBJECT is nil or a coding-system.
8061 See the documentation of `define-coding-system' for information
8062 about coding-system objects.  */)
8063   (Lisp_Object object)
8064 {
8065   if (NILP (object)
8066       || CODING_SYSTEM_ID (object) >= 0)
8067     return Qt;
8068   if (! SYMBOLP (object)
8069       || NILP (Fget (object, Qcoding_system_define_form)))
8070     return Qnil;
8071   return Qt;
8072 }
8073
8074 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8075        Sread_non_nil_coding_system, 1, 1, 0,
8076        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8077   (Lisp_Object prompt)
8078 {
8079   Lisp_Object val;
8080   do
8081     {
8082       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8083                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8084     }
8085   while (SCHARS (val) == 0);
8086   return (Fintern (val, Qnil));
8087 }
8088
8089 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8090        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8091 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8092 Ignores case when completing coding systems (all Emacs coding systems
8093 are lower-case).  */)
8094   (Lisp_Object prompt, Lisp_Object default_coding_system)
8095 {
8096   Lisp_Object val;
8097   int count = SPECPDL_INDEX ();
8098
8099   if (SYMBOLP (default_coding_system))
8100     default_coding_system = SYMBOL_NAME (default_coding_system);
8101   specbind (Qcompletion_ignore_case, Qt);
8102   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8103                           Qt, Qnil, Qcoding_system_history,
8104                           default_coding_system, Qnil);
8105   unbind_to (count, Qnil);
8106   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8107 }
8108
8109 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8110        1, 1, 0,
8111        doc: /* Check validity of CODING-SYSTEM.
8112 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8113 It is valid if it is nil or a symbol defined as a coding system by the
8114 function `define-coding-system'.  */)
8115   (Lisp_Object coding_system)
8116 {
8117   Lisp_Object define_form;
8118
8119   define_form = Fget (coding_system, Qcoding_system_define_form);
8120   if (! NILP (define_form))
8121     {
8122       Fput (coding_system, Qcoding_system_define_form, Qnil);
8123       safe_eval (define_form);
8124     }
8125   if (!NILP (Fcoding_system_p (coding_system)))
8126     return coding_system;
8127   xsignal1 (Qcoding_system_error, coding_system);
8128 }
8129
8130 \f
8131 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8132    HIGHEST is nonzero, return the coding system of the highest
8133    priority among the detected coding systems.  Otherwise return a
8134    list of detected coding systems sorted by their priorities.  If
8135    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8136    multibyte form but contains only ASCII and eight-bit chars.
8137    Otherwise, the bytes are raw bytes.
8138
8139    CODING-SYSTEM controls the detection as below:
8140
8141    If it is nil, detect both text-format and eol-format.  If the
8142    text-format part of CODING-SYSTEM is already specified
8143    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8144    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8145    detect only text-format.  */
8146
8147 Lisp_Object
8148 detect_coding_system (const unsigned char *src,
8149                       EMACS_INT src_chars, EMACS_INT src_bytes,
8150                       int highest, int multibytep,
8151                       Lisp_Object coding_system)
8152 {
8153   const unsigned char *src_end = src + src_bytes;
8154   Lisp_Object attrs, eol_type;
8155   Lisp_Object val = Qnil;
8156   struct coding_system coding;
8157   int id;
8158   struct coding_detection_info detect_info;
8159   enum coding_category base_category;
8160   int null_byte_found = 0, eight_bit_found = 0;
8161
8162   if (NILP (coding_system))
8163     coding_system = Qundecided;
8164   setup_coding_system (coding_system, &coding);
8165   attrs = CODING_ID_ATTRS (coding.id);
8166   eol_type = CODING_ID_EOL_TYPE (coding.id);
8167   coding_system = CODING_ATTR_BASE_NAME (attrs);
8168
8169   coding.source = src;
8170   coding.src_chars = src_chars;
8171   coding.src_bytes = src_bytes;
8172   coding.src_multibyte = multibytep;
8173   coding.consumed = 0;
8174   coding.mode |= CODING_MODE_LAST_BLOCK;
8175   coding.head_ascii = 0;
8176
8177   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8178
8179   /* At first, detect text-format if necessary.  */
8180   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8181   if (base_category == coding_category_undecided)
8182     {
8183       enum coding_category category;
8184       struct coding_system *this;
8185       int c, i;
8186
8187       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8188       for (; src < src_end; src++)
8189         {
8190           c = *src;
8191           if (c & 0x80)
8192             {
8193               eight_bit_found = 1;
8194               if (null_byte_found)
8195                 break;
8196             }
8197           else if (c < 0x20)
8198             {
8199               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8200                   && ! inhibit_iso_escape_detection
8201                   && ! detect_info.checked)
8202                 {
8203                   if (detect_coding_iso_2022 (&coding, &detect_info))
8204                     {
8205                       /* We have scanned the whole data.  */
8206                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8207                         {
8208                           /* We didn't find an 8-bit code.  We may
8209                              have found a null-byte, but it's very
8210                              rare that a binary file confirm to
8211                              ISO-2022.  */
8212                           src = src_end;
8213                           coding.head_ascii = src - coding.source;
8214                         }
8215                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8216                       break;
8217                     }
8218                 }
8219               else if (! c && !inhibit_null_byte_detection)
8220                 {
8221                   null_byte_found = 1;
8222                   if (eight_bit_found)
8223                     break;
8224                 }
8225               if (! eight_bit_found)
8226                 coding.head_ascii++;
8227             }
8228           else if (! eight_bit_found)
8229             coding.head_ascii++;
8230         }
8231
8232       if (null_byte_found || eight_bit_found
8233           || coding.head_ascii < coding.src_bytes
8234           || detect_info.found)
8235         {
8236           if (coding.head_ascii == coding.src_bytes)
8237             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8238             for (i = 0; i < coding_category_raw_text; i++)
8239               {
8240                 category = coding_priorities[i];
8241                 this = coding_categories + category;
8242                 if (detect_info.found & (1 << category))
8243                   break;
8244               }
8245           else
8246             {
8247               if (null_byte_found)
8248                 {
8249                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8250                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8251                 }
8252               for (i = 0; i < coding_category_raw_text; i++)
8253                 {
8254                   category = coding_priorities[i];
8255                   this = coding_categories + category;
8256
8257                   if (this->id < 0)
8258                     {
8259                       /* No coding system of this category is defined.  */
8260                       detect_info.rejected |= (1 << category);
8261                     }
8262                   else if (category >= coding_category_raw_text)
8263                     continue;
8264                   else if (detect_info.checked & (1 << category))
8265                     {
8266                       if (highest
8267                           && (detect_info.found & (1 << category)))
8268                         break;
8269                     }
8270                   else if ((*(this->detector)) (&coding, &detect_info)
8271                            && highest
8272                            && (detect_info.found & (1 << category)))
8273                     {
8274                       if (category == coding_category_utf_16_auto)
8275                         {
8276                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8277                             category = coding_category_utf_16_le;
8278                           else
8279                             category = coding_category_utf_16_be;
8280                         }
8281                       break;
8282                     }
8283                 }
8284             }
8285         }
8286
8287       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8288           || null_byte_found)
8289         {
8290           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8291           id = CODING_SYSTEM_ID (Qno_conversion);
8292           val = Fcons (make_number (id), Qnil);
8293         }
8294       else if (! detect_info.rejected && ! detect_info.found)
8295         {
8296           detect_info.found = CATEGORY_MASK_ANY;
8297           id = coding_categories[coding_category_undecided].id;
8298           val = Fcons (make_number (id), Qnil);
8299         }
8300       else if (highest)
8301         {
8302           if (detect_info.found)
8303             {
8304               detect_info.found = 1 << category;
8305               val = Fcons (make_number (this->id), Qnil);
8306             }
8307           else
8308             for (i = 0; i < coding_category_raw_text; i++)
8309               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8310                 {
8311                   detect_info.found = 1 << coding_priorities[i];
8312                   id = coding_categories[coding_priorities[i]].id;
8313                   val = Fcons (make_number (id), Qnil);
8314                   break;
8315                 }
8316         }
8317       else
8318         {
8319           int mask = detect_info.rejected | detect_info.found;
8320           int found = 0;
8321
8322           for (i = coding_category_raw_text - 1; i >= 0; i--)
8323             {
8324               category = coding_priorities[i];
8325               if (! (mask & (1 << category)))
8326                 {
8327                   found |= 1 << category;
8328                   id = coding_categories[category].id;
8329                   if (id >= 0)
8330                     val = Fcons (make_number (id), val);
8331                 }
8332             }
8333           for (i = coding_category_raw_text - 1; i >= 0; i--)
8334             {
8335               category = coding_priorities[i];
8336               if (detect_info.found & (1 << category))
8337                 {
8338                   id = coding_categories[category].id;
8339                   val = Fcons (make_number (id), val);
8340                 }
8341             }
8342           detect_info.found |= found;
8343         }
8344     }
8345   else if (base_category == coding_category_utf_8_auto)
8346     {
8347       if (detect_coding_utf_8 (&coding, &detect_info))
8348         {
8349           struct coding_system *this;
8350
8351           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8352             this = coding_categories + coding_category_utf_8_sig;
8353           else
8354             this = coding_categories + coding_category_utf_8_nosig;
8355           val = Fcons (make_number (this->id), Qnil);
8356         }
8357     }
8358   else if (base_category == coding_category_utf_16_auto)
8359     {
8360       if (detect_coding_utf_16 (&coding, &detect_info))
8361         {
8362           struct coding_system *this;
8363
8364           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8365             this = coding_categories + coding_category_utf_16_le;
8366           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8367             this = coding_categories + coding_category_utf_16_be;
8368           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8369             this = coding_categories + coding_category_utf_16_be_nosig;
8370           else
8371             this = coding_categories + coding_category_utf_16_le_nosig;
8372           val = Fcons (make_number (this->id), Qnil);
8373         }
8374     }
8375   else
8376     {
8377       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8378       val = Fcons (make_number (coding.id), Qnil);
8379     }
8380
8381   /* Then, detect eol-format if necessary.  */
8382   {
8383     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8384     Lisp_Object tail;
8385
8386     if (VECTORP (eol_type))
8387       {
8388         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8389           {
8390             if (null_byte_found)
8391               normal_eol = EOL_SEEN_LF;
8392             else
8393               normal_eol = detect_eol (coding.source, src_bytes,
8394                                        coding_category_raw_text);
8395           }
8396         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8397                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8398           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8399                                       coding_category_utf_16_be);
8400         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8401                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8402           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8403                                       coding_category_utf_16_le);
8404       }
8405     else
8406       {
8407         if (EQ (eol_type, Qunix))
8408           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8409         else if (EQ (eol_type, Qdos))
8410           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8411         else
8412           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8413       }
8414
8415     for (tail = val; CONSP (tail); tail = XCDR (tail))
8416       {
8417         enum coding_category category;
8418         int this_eol;
8419
8420         id = XINT (XCAR (tail));
8421         attrs = CODING_ID_ATTRS (id);
8422         category = XINT (CODING_ATTR_CATEGORY (attrs));
8423         eol_type = CODING_ID_EOL_TYPE (id);
8424         if (VECTORP (eol_type))
8425           {
8426             if (category == coding_category_utf_16_be
8427                 || category == coding_category_utf_16_be_nosig)
8428               this_eol = utf_16_be_eol;
8429             else if (category == coding_category_utf_16_le
8430                      || category == coding_category_utf_16_le_nosig)
8431               this_eol = utf_16_le_eol;
8432             else
8433               this_eol = normal_eol;
8434
8435             if (this_eol == EOL_SEEN_LF)
8436               XSETCAR (tail, AREF (eol_type, 0));
8437             else if (this_eol == EOL_SEEN_CRLF)
8438               XSETCAR (tail, AREF (eol_type, 1));
8439             else if (this_eol == EOL_SEEN_CR)
8440               XSETCAR (tail, AREF (eol_type, 2));
8441             else
8442               XSETCAR (tail, CODING_ID_NAME (id));
8443           }
8444         else
8445           XSETCAR (tail, CODING_ID_NAME (id));
8446       }
8447   }
8448
8449   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8450 }
8451
8452
8453 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8454        2, 3, 0,
8455        doc: /* Detect coding system of the text in the region between START and END.
8456 Return a list of possible coding systems ordered by priority.
8457 The coding systems to try and their priorities follows what
8458 the function `coding-system-priority-list' (which see) returns.
8459
8460 If only ASCII characters are found (except for such ISO-2022 control
8461 characters as ESC), it returns a list of single element `undecided'
8462 or its subsidiary coding system according to a detected end-of-line
8463 format.
8464
8465 If optional argument HIGHEST is non-nil, return the coding system of
8466 highest priority.  */)
8467   (Lisp_Object start, Lisp_Object end, Lisp_Object highest)
8468 {
8469   int from, to;
8470   int from_byte, to_byte;
8471
8472   CHECK_NUMBER_COERCE_MARKER (start);
8473   CHECK_NUMBER_COERCE_MARKER (end);
8474
8475   validate_region (&start, &end);
8476   from = XINT (start), to = XINT (end);
8477   from_byte = CHAR_TO_BYTE (from);
8478   to_byte = CHAR_TO_BYTE (to);
8479
8480   if (from < GPT && to >= GPT)
8481     move_gap_both (to, to_byte);
8482
8483   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8484                                to - from, to_byte - from_byte,
8485                                !NILP (highest),
8486                                !NILP (BVAR (current_buffer
8487                                       , enable_multibyte_characters)),
8488                                Qnil);
8489 }
8490
8491 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8492        1, 2, 0,
8493        doc: /* Detect coding system of the text in STRING.
8494 Return a list of possible coding systems ordered by priority.
8495 The coding systems to try and their priorities follows what
8496 the function `coding-system-priority-list' (which see) returns.
8497
8498 If only ASCII characters are found (except for such ISO-2022 control
8499 characters as ESC), it returns a list of single element `undecided'
8500 or its subsidiary coding system according to a detected end-of-line
8501 format.
8502
8503 If optional argument HIGHEST is non-nil, return the coding system of
8504 highest priority.  */)
8505   (Lisp_Object string, Lisp_Object highest)
8506 {
8507   CHECK_STRING (string);
8508
8509   return detect_coding_system (SDATA (string),
8510                                SCHARS (string), SBYTES (string),
8511                                !NILP (highest), STRING_MULTIBYTE (string),
8512                                Qnil);
8513 }
8514
8515
8516 static INLINE int
8517 char_encodable_p (int c, Lisp_Object attrs)
8518 {
8519   Lisp_Object tail;
8520   struct charset *charset;
8521   Lisp_Object translation_table;
8522
8523   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8524   if (! NILP (translation_table))
8525     c = translate_char (translation_table, c);
8526   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8527        CONSP (tail); tail = XCDR (tail))
8528     {
8529       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8530       if (CHAR_CHARSET_P (c, charset))
8531         break;
8532     }
8533   return (! NILP (tail));
8534 }
8535
8536
8537 /* Return a list of coding systems that safely encode the text between
8538    START and END.  If EXCLUDE is non-nil, it is a list of coding
8539    systems not to check.  The returned list doesn't contain any such
8540    coding systems.  In any case, if the text contains only ASCII or is
8541    unibyte, return t.  */
8542
8543 DEFUN ("find-coding-systems-region-internal",
8544        Ffind_coding_systems_region_internal,
8545        Sfind_coding_systems_region_internal, 2, 3, 0,
8546        doc: /* Internal use only.  */)
8547   (Lisp_Object start, Lisp_Object end, Lisp_Object exclude)
8548 {
8549   Lisp_Object coding_attrs_list, safe_codings;
8550   EMACS_INT start_byte, end_byte;
8551   const unsigned char *p, *pbeg, *pend;
8552   int c;
8553   Lisp_Object tail, elt, work_table;
8554
8555   if (STRINGP (start))
8556     {
8557       if (!STRING_MULTIBYTE (start)
8558           || SCHARS (start) == SBYTES (start))
8559         return Qt;
8560       start_byte = 0;
8561       end_byte = SBYTES (start);
8562     }
8563   else
8564     {
8565       CHECK_NUMBER_COERCE_MARKER (start);
8566       CHECK_NUMBER_COERCE_MARKER (end);
8567       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8568         args_out_of_range (start, end);
8569       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8570         return Qt;
8571       start_byte = CHAR_TO_BYTE (XINT (start));
8572       end_byte = CHAR_TO_BYTE (XINT (end));
8573       if (XINT (end) - XINT (start) == end_byte - start_byte)
8574         return Qt;
8575
8576       if (XINT (start) < GPT && XINT (end) > GPT)
8577         {
8578           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8579             move_gap_both (XINT (start), start_byte);
8580           else
8581             move_gap_both (XINT (end), end_byte);
8582         }
8583     }
8584
8585   coding_attrs_list = Qnil;
8586   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8587     if (NILP (exclude)
8588         || NILP (Fmemq (XCAR (tail), exclude)))
8589       {
8590         Lisp_Object attrs;
8591
8592         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8593         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8594             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8595           {
8596             ASET (attrs, coding_attr_trans_tbl,
8597                   get_translation_table (attrs, 1, NULL));
8598             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8599           }
8600       }
8601
8602   if (STRINGP (start))
8603     p = pbeg = SDATA (start);
8604   else
8605     p = pbeg = BYTE_POS_ADDR (start_byte);
8606   pend = p + (end_byte - start_byte);
8607
8608   while (p < pend && ASCII_BYTE_P (*p)) p++;
8609   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8610
8611   work_table = Fmake_char_table (Qnil, Qnil);
8612   while (p < pend)
8613     {
8614       if (ASCII_BYTE_P (*p))
8615         p++;
8616       else
8617         {
8618           c = STRING_CHAR_ADVANCE (p);
8619           if (!NILP (char_table_ref (work_table, c)))
8620             /* This character was already checked.  Ignore it.  */
8621             continue;
8622
8623           charset_map_loaded = 0;
8624           for (tail = coding_attrs_list; CONSP (tail);)
8625             {
8626               elt = XCAR (tail);
8627               if (NILP (elt))
8628                 tail = XCDR (tail);
8629               else if (char_encodable_p (c, elt))
8630                 tail = XCDR (tail);
8631               else if (CONSP (XCDR (tail)))
8632                 {
8633                   XSETCAR (tail, XCAR (XCDR (tail)));
8634                   XSETCDR (tail, XCDR (XCDR (tail)));
8635                 }
8636               else
8637                 {
8638                   XSETCAR (tail, Qnil);
8639                   tail = XCDR (tail);
8640                 }
8641             }
8642           if (charset_map_loaded)
8643             {
8644               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8645
8646               if (STRINGP (start))
8647                 pbeg = SDATA (start);
8648               else
8649                 pbeg = BYTE_POS_ADDR (start_byte);
8650               p = pbeg + p_offset;
8651               pend = pbeg + pend_offset;
8652             }
8653           char_table_set (work_table, c, Qt);
8654         }
8655     }
8656
8657   safe_codings = list2 (Qraw_text, Qno_conversion);
8658   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8659     if (! NILP (XCAR (tail)))
8660       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8661
8662   return safe_codings;
8663 }
8664
8665
8666 DEFUN ("unencodable-char-position", Funencodable_char_position,
8667        Sunencodable_char_position, 3, 5, 0,
8668        doc: /*
8669 Return position of first un-encodable character in a region.
8670 START and END specify the region and CODING-SYSTEM specifies the
8671 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8672
8673 If optional 4th argument COUNT is non-nil, it specifies at most how
8674 many un-encodable characters to search.  In this case, the value is a
8675 list of positions.
8676
8677 If optional 5th argument STRING is non-nil, it is a string to search
8678 for un-encodable characters.  In that case, START and END are indexes
8679 to the string.  */)
8680   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object count, Lisp_Object string)
8681 {
8682   int n;
8683   struct coding_system coding;
8684   Lisp_Object attrs, charset_list, translation_table;
8685   Lisp_Object positions;
8686   int from, to;
8687   const unsigned char *p, *stop, *pend;
8688   int ascii_compatible;
8689
8690   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8691   attrs = CODING_ID_ATTRS (coding.id);
8692   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8693     return Qnil;
8694   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8695   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8696   translation_table = get_translation_table (attrs, 1, NULL);
8697
8698   if (NILP (string))
8699     {
8700       validate_region (&start, &end);
8701       from = XINT (start);
8702       to = XINT (end);
8703       if (NILP (BVAR (current_buffer, enable_multibyte_characters))
8704           || (ascii_compatible
8705               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8706         return Qnil;
8707       p = CHAR_POS_ADDR (from);
8708       pend = CHAR_POS_ADDR (to);
8709       if (from < GPT && to >= GPT)
8710         stop = GPT_ADDR;
8711       else
8712         stop = pend;
8713     }
8714   else
8715     {
8716       CHECK_STRING (string);
8717       CHECK_NATNUM (start);
8718       CHECK_NATNUM (end);
8719       from = XINT (start);
8720       to = XINT (end);
8721       if (from > to
8722           || to > SCHARS (string))
8723         args_out_of_range_3 (string, start, end);
8724       if (! STRING_MULTIBYTE (string))
8725         return Qnil;
8726       p = SDATA (string) + string_char_to_byte (string, from);
8727       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8728       if (ascii_compatible && (to - from) == (pend - p))
8729         return Qnil;
8730     }
8731
8732   if (NILP (count))
8733     n = 1;
8734   else
8735     {
8736       CHECK_NATNUM (count);
8737       n = XINT (count);
8738     }
8739
8740   positions = Qnil;
8741   while (1)
8742     {
8743       int c;
8744
8745       if (ascii_compatible)
8746         while (p < stop && ASCII_BYTE_P (*p))
8747           p++, from++;
8748       if (p >= stop)
8749         {
8750           if (p >= pend)
8751             break;
8752           stop = pend;
8753           p = GAP_END_ADDR;
8754         }
8755
8756       c = STRING_CHAR_ADVANCE (p);
8757       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8758           && ! char_charset (translate_char (translation_table, c),
8759                              charset_list, NULL))
8760         {
8761           positions = Fcons (make_number (from), positions);
8762           n--;
8763           if (n == 0)
8764             break;
8765         }
8766
8767       from++;
8768     }
8769
8770   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8771 }
8772
8773
8774 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8775        Scheck_coding_systems_region, 3, 3, 0,
8776        doc: /* Check if the region is encodable by coding systems.
8777
8778 START and END are buffer positions specifying the region.
8779 CODING-SYSTEM-LIST is a list of coding systems to check.
8780
8781 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8782 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8783 whole region, POS0, POS1, ... are buffer positions where non-encodable
8784 characters are found.
8785
8786 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8787 value is nil.
8788
8789 START may be a string.  In that case, check if the string is
8790 encodable, and the value contains indices to the string instead of
8791 buffer positions.  END is ignored.
8792
8793 If the current buffer (or START if it is a string) is unibyte, the value
8794 is nil.  */)
8795   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system_list)
8796 {
8797   Lisp_Object list;
8798   EMACS_INT start_byte, end_byte;
8799   int pos;
8800   const unsigned char *p, *pbeg, *pend;
8801   int c;
8802   Lisp_Object tail, elt, attrs;
8803
8804   if (STRINGP (start))
8805     {
8806       if (!STRING_MULTIBYTE (start)
8807           || SCHARS (start) == SBYTES (start))
8808         return Qnil;
8809       start_byte = 0;
8810       end_byte = SBYTES (start);
8811       pos = 0;
8812     }
8813   else
8814     {
8815       CHECK_NUMBER_COERCE_MARKER (start);
8816       CHECK_NUMBER_COERCE_MARKER (end);
8817       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8818         args_out_of_range (start, end);
8819       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
8820         return Qnil;
8821       start_byte = CHAR_TO_BYTE (XINT (start));
8822       end_byte = CHAR_TO_BYTE (XINT (end));
8823       if (XINT (end) - XINT (start) == end_byte - start_byte)
8824         return Qnil;
8825
8826       if (XINT (start) < GPT && XINT (end) > GPT)
8827         {
8828           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8829             move_gap_both (XINT (start), start_byte);
8830           else
8831             move_gap_both (XINT (end), end_byte);
8832         }
8833       pos = XINT (start);
8834     }
8835
8836   list = Qnil;
8837   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8838     {
8839       elt = XCAR (tail);
8840       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8841       ASET (attrs, coding_attr_trans_tbl,
8842             get_translation_table (attrs, 1, NULL));
8843       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8844     }
8845
8846   if (STRINGP (start))
8847     p = pbeg = SDATA (start);
8848   else
8849     p = pbeg = BYTE_POS_ADDR (start_byte);
8850   pend = p + (end_byte - start_byte);
8851
8852   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8853   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8854
8855   while (p < pend)
8856     {
8857       if (ASCII_BYTE_P (*p))
8858         p++;
8859       else
8860         {
8861           c = STRING_CHAR_ADVANCE (p);
8862
8863           charset_map_loaded = 0;
8864           for (tail = list; CONSP (tail); tail = XCDR (tail))
8865             {
8866               elt = XCDR (XCAR (tail));
8867               if (! char_encodable_p (c, XCAR (elt)))
8868                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8869             }
8870           if (charset_map_loaded)
8871             {
8872               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8873
8874               if (STRINGP (start))
8875                 pbeg = SDATA (start);
8876               else
8877                 pbeg = BYTE_POS_ADDR (start_byte);
8878               p = pbeg + p_offset;
8879               pend = pbeg + pend_offset;
8880             }
8881         }
8882       pos++;
8883     }
8884
8885   tail = list;
8886   list = Qnil;
8887   for (; CONSP (tail); tail = XCDR (tail))
8888     {
8889       elt = XCAR (tail);
8890       if (CONSP (XCDR (XCDR (elt))))
8891         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8892                       list);
8893     }
8894
8895   return list;
8896 }
8897
8898
8899 Lisp_Object
8900 code_convert_region (Lisp_Object start, Lisp_Object end,
8901                      Lisp_Object coding_system, Lisp_Object dst_object,
8902                      int encodep, int norecord)
8903 {
8904   struct coding_system coding;
8905   EMACS_INT from, from_byte, to, to_byte;
8906   Lisp_Object src_object;
8907
8908   CHECK_NUMBER_COERCE_MARKER (start);
8909   CHECK_NUMBER_COERCE_MARKER (end);
8910   if (NILP (coding_system))
8911     coding_system = Qno_conversion;
8912   else
8913     CHECK_CODING_SYSTEM (coding_system);
8914   src_object = Fcurrent_buffer ();
8915   if (NILP (dst_object))
8916     dst_object = src_object;
8917   else if (! EQ (dst_object, Qt))
8918     CHECK_BUFFER (dst_object);
8919
8920   validate_region (&start, &end);
8921   from = XFASTINT (start);
8922   from_byte = CHAR_TO_BYTE (from);
8923   to = XFASTINT (end);
8924   to_byte = CHAR_TO_BYTE (to);
8925
8926   setup_coding_system (coding_system, &coding);
8927   coding.mode |= CODING_MODE_LAST_BLOCK;
8928
8929   if (encodep)
8930     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8931                           dst_object);
8932   else
8933     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
8934                           dst_object);
8935   if (! norecord)
8936     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8937
8938   return (BUFFERP (dst_object)
8939           ? make_number (coding.produced_char)
8940           : coding.dst_object);
8941 }
8942
8943
8944 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
8945        3, 4, "r\nzCoding system: ",
8946        doc: /* Decode the current region from the specified coding system.
8947 When called from a program, takes four arguments:
8948         START, END, CODING-SYSTEM, and DESTINATION.
8949 START and END are buffer positions.
8950
8951 Optional 4th arguments DESTINATION specifies where the decoded text goes.
8952 If nil, the region between START and END is replaced by the decoded text.
8953 If buffer, the decoded text is inserted in that buffer after point (point
8954 does not move).
8955 In those cases, the length of the decoded text is returned.
8956 If DESTINATION is t, the decoded text is returned.
8957
8958 This function sets `last-coding-system-used' to the precise coding system
8959 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8960 not fully specified.)  */)
8961   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8962 {
8963   return code_convert_region (start, end, coding_system, destination, 0, 0);
8964 }
8965
8966 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
8967        3, 4, "r\nzCoding system: ",
8968        doc: /* Encode the current region by specified coding system.
8969 When called from a program, takes four arguments:
8970         START, END, CODING-SYSTEM and DESTINATION.
8971 START and END are buffer positions.
8972
8973 Optional 4th arguments DESTINATION specifies where the encoded text goes.
8974 If nil, the region between START and END is replace by the encoded text.
8975 If buffer, the encoded text is inserted in that buffer after point (point
8976 does not move).
8977 In those cases, the length of the encoded text is returned.
8978 If DESTINATION is t, the encoded text is returned.
8979
8980 This function sets `last-coding-system-used' to the precise coding system
8981 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8982 not fully specified.)  */)
8983   (Lisp_Object start, Lisp_Object end, Lisp_Object coding_system, Lisp_Object destination)
8984 {
8985   return code_convert_region (start, end, coding_system, destination, 1, 0);
8986 }
8987
8988 Lisp_Object
8989 code_convert_string (Lisp_Object string, Lisp_Object coding_system,
8990                      Lisp_Object dst_object, int encodep, int nocopy, int norecord)
8991 {
8992   struct coding_system coding;
8993   EMACS_INT chars, bytes;
8994
8995   CHECK_STRING (string);
8996   if (NILP (coding_system))
8997     {
8998       if (! norecord)
8999         Vlast_coding_system_used = Qno_conversion;
9000       if (NILP (dst_object))
9001         return (nocopy ? Fcopy_sequence (string) : string);
9002     }
9003
9004   if (NILP (coding_system))
9005     coding_system = Qno_conversion;
9006   else
9007     CHECK_CODING_SYSTEM (coding_system);
9008   if (NILP (dst_object))
9009     dst_object = Qt;
9010   else if (! EQ (dst_object, Qt))
9011     CHECK_BUFFER (dst_object);
9012
9013   setup_coding_system (coding_system, &coding);
9014   coding.mode |= CODING_MODE_LAST_BLOCK;
9015   chars = SCHARS (string);
9016   bytes = SBYTES (string);
9017   if (encodep)
9018     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9019   else
9020     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9021   if (! norecord)
9022     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9023
9024   return (BUFFERP (dst_object)
9025           ? make_number (coding.produced_char)
9026           : coding.dst_object);
9027 }
9028
9029
9030 /* Encode or decode STRING according to CODING_SYSTEM.
9031    Do not set Vlast_coding_system_used.
9032
9033    This function is called only from macros DECODE_FILE and
9034    ENCODE_FILE, thus we ignore character composition.  */
9035
9036 Lisp_Object
9037 code_convert_string_norecord (Lisp_Object string, Lisp_Object coding_system,
9038                               int encodep)
9039 {
9040   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9041 }
9042
9043
9044 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9045        2, 4, 0,
9046        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9047
9048 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9049 if the decoding operation is trivial.
9050
9051 Optional fourth arg BUFFER non-nil means that the decoded text is
9052 inserted in that buffer after point (point does not move).  In this
9053 case, the return value is the length of the decoded text.
9054
9055 This function sets `last-coding-system-used' to the precise coding system
9056 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9057 not fully specified.)  */)
9058   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9059 {
9060   return code_convert_string (string, coding_system, buffer,
9061                               0, ! NILP (nocopy), 0);
9062 }
9063
9064 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9065        2, 4, 0,
9066        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9067
9068 Optional third arg NOCOPY non-nil means it is OK to return STRING
9069 itself if the encoding operation is trivial.
9070
9071 Optional fourth arg BUFFER non-nil means that the encoded text is
9072 inserted in that buffer after point (point does not move).  In this
9073 case, the return value is the length of the encoded text.
9074
9075 This function sets `last-coding-system-used' to the precise coding system
9076 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9077 not fully specified.)  */)
9078   (Lisp_Object string, Lisp_Object coding_system, Lisp_Object nocopy, Lisp_Object buffer)
9079 {
9080   return code_convert_string (string, coding_system, buffer,
9081                               1, ! NILP (nocopy), 1);
9082 }
9083
9084 \f
9085 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9086        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9087 Return the corresponding character.  */)
9088   (Lisp_Object code)
9089 {
9090   Lisp_Object spec, attrs, val;
9091   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9092   int c;
9093
9094   CHECK_NATNUM (code);
9095   c = XFASTINT (code);
9096   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9097   attrs = AREF (spec, 0);
9098
9099   if (ASCII_BYTE_P (c)
9100       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9101     return code;
9102
9103   val = CODING_ATTR_CHARSET_LIST (attrs);
9104   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9105   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9106   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9107
9108   if (c <= 0x7F)
9109     charset = charset_roman;
9110   else if (c >= 0xA0 && c < 0xDF)
9111     {
9112       charset = charset_kana;
9113       c -= 0x80;
9114     }
9115   else
9116     {
9117       int s1 = c >> 8, s2 = c & 0xFF;
9118
9119       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9120           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9121         error ("Invalid code: %d", code);
9122       SJIS_TO_JIS (c);
9123       charset = charset_kanji;
9124     }
9125   c = DECODE_CHAR (charset, c);
9126   if (c < 0)
9127     error ("Invalid code: %d", code);
9128   return make_number (c);
9129 }
9130
9131
9132 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9133        doc: /* Encode a Japanese character CH to shift_jis encoding.
9134 Return the corresponding code in SJIS.  */)
9135   (Lisp_Object ch)
9136 {
9137   Lisp_Object spec, attrs, charset_list;
9138   int c;
9139   struct charset *charset;
9140   unsigned code;
9141
9142   CHECK_CHARACTER (ch);
9143   c = XFASTINT (ch);
9144   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9145   attrs = AREF (spec, 0);
9146
9147   if (ASCII_CHAR_P (c)
9148       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9149     return ch;
9150
9151   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9152   charset = char_charset (c, charset_list, &code);
9153   if (code == CHARSET_INVALID_CODE (charset))
9154     error ("Can't encode by shift_jis encoding: %d", c);
9155   JIS_TO_SJIS (code);
9156
9157   return make_number (code);
9158 }
9159
9160 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9161        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9162 Return the corresponding character.  */)
9163   (Lisp_Object code)
9164 {
9165   Lisp_Object spec, attrs, val;
9166   struct charset *charset_roman, *charset_big5, *charset;
9167   int c;
9168
9169   CHECK_NATNUM (code);
9170   c = XFASTINT (code);
9171   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9172   attrs = AREF (spec, 0);
9173
9174   if (ASCII_BYTE_P (c)
9175       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9176     return code;
9177
9178   val = CODING_ATTR_CHARSET_LIST (attrs);
9179   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9180   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9181
9182   if (c <= 0x7F)
9183     charset = charset_roman;
9184   else
9185     {
9186       int b1 = c >> 8, b2 = c & 0x7F;
9187       if (b1 < 0xA1 || b1 > 0xFE
9188           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9189         error ("Invalid code: %d", code);
9190       charset = charset_big5;
9191     }
9192   c = DECODE_CHAR (charset, (unsigned )c);
9193   if (c < 0)
9194     error ("Invalid code: %d", code);
9195   return make_number (c);
9196 }
9197
9198 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9199        doc: /* Encode the Big5 character CH to BIG5 coding system.
9200 Return the corresponding character code in Big5.  */)
9201   (Lisp_Object ch)
9202 {
9203   Lisp_Object spec, attrs, charset_list;
9204   struct charset *charset;
9205   int c;
9206   unsigned code;
9207
9208   CHECK_CHARACTER (ch);
9209   c = XFASTINT (ch);
9210   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9211   attrs = AREF (spec, 0);
9212   if (ASCII_CHAR_P (c)
9213       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9214     return ch;
9215
9216   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9217   charset = char_charset (c, charset_list, &code);
9218   if (code == CHARSET_INVALID_CODE (charset))
9219     error ("Can't encode by Big5 encoding: %d", c);
9220
9221   return make_number (code);
9222 }
9223
9224 \f
9225 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9226        Sset_terminal_coding_system_internal, 1, 2, 0,
9227        doc: /* Internal use only.  */)
9228   (Lisp_Object coding_system, Lisp_Object terminal)
9229 {
9230   struct terminal *term = get_terminal (terminal, 1);
9231   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (term);
9232   CHECK_SYMBOL (coding_system);
9233   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9234   /* We had better not send unsafe characters to terminal.  */
9235   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9236   /* Character composition should be disabled.  */
9237   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9238   terminal_coding->src_multibyte = 1;
9239   terminal_coding->dst_multibyte = 0;
9240   if (terminal_coding->common_flags & CODING_REQUIRE_ENCODING_MASK)
9241     term->charset_list = coding_charset_list (terminal_coding);
9242   else
9243     term->charset_list = Fcons (make_number (charset_ascii), Qnil);
9244   return Qnil;
9245 }
9246
9247 DEFUN ("set-safe-terminal-coding-system-internal",
9248        Fset_safe_terminal_coding_system_internal,
9249        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9250        doc: /* Internal use only.  */)
9251   (Lisp_Object coding_system)
9252 {
9253   CHECK_SYMBOL (coding_system);
9254   setup_coding_system (Fcheck_coding_system (coding_system),
9255                        &safe_terminal_coding);
9256   /* Character composition should be disabled.  */
9257   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9258   safe_terminal_coding.src_multibyte = 1;
9259   safe_terminal_coding.dst_multibyte = 0;
9260   return Qnil;
9261 }
9262
9263 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9264        Sterminal_coding_system, 0, 1, 0,
9265        doc: /* Return coding system specified for terminal output on the given terminal.
9266 TERMINAL may be a terminal object, a frame, or nil for the selected
9267 frame's terminal device.  */)
9268   (Lisp_Object terminal)
9269 {
9270   struct coding_system *terminal_coding
9271     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9272   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9273
9274   /* For backward compatibility, return nil if it is `undecided'. */
9275   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9276 }
9277
9278 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9279        Sset_keyboard_coding_system_internal, 1, 2, 0,
9280        doc: /* Internal use only.  */)
9281   (Lisp_Object coding_system, Lisp_Object terminal)
9282 {
9283   struct terminal *t = get_terminal (terminal, 1);
9284   CHECK_SYMBOL (coding_system);
9285   if (NILP (coding_system))
9286     coding_system = Qno_conversion;
9287   else
9288     Fcheck_coding_system (coding_system);
9289   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9290   /* Character composition should be disabled.  */
9291   TERMINAL_KEYBOARD_CODING (t)->common_flags
9292     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9293   return Qnil;
9294 }
9295
9296 DEFUN ("keyboard-coding-system",
9297        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9298        doc: /* Return coding system specified for decoding keyboard input.  */)
9299   (Lisp_Object terminal)
9300 {
9301   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9302                          (get_terminal (terminal, 1))->id);
9303 }
9304
9305 \f
9306 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9307        Sfind_operation_coding_system,  1, MANY, 0,
9308        doc: /* Choose a coding system for an operation based on the target name.
9309 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9310 DECODING-SYSTEM is the coding system to use for decoding
9311 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9312 for encoding (in case OPERATION does encoding).
9313
9314 The first argument OPERATION specifies an I/O primitive:
9315   For file I/O, `insert-file-contents' or `write-region'.
9316   For process I/O, `call-process', `call-process-region', or `start-process'.
9317   For network I/O, `open-network-stream'.
9318
9319 The remaining arguments should be the same arguments that were passed
9320 to the primitive.  Depending on which primitive, one of those arguments
9321 is selected as the TARGET.  For example, if OPERATION does file I/O,
9322 whichever argument specifies the file name is TARGET.
9323
9324 TARGET has a meaning which depends on OPERATION:
9325   For file I/O, TARGET is a file name (except for the special case below).
9326   For process I/O, TARGET is a process name.
9327   For network I/O, TARGET is a service name or a port number.
9328
9329 This function looks up what is specified for TARGET in
9330 `file-coding-system-alist', `process-coding-system-alist',
9331 or `network-coding-system-alist' depending on OPERATION.
9332 They may specify a coding system, a cons of coding systems,
9333 or a function symbol to call.
9334 In the last case, we call the function with one argument,
9335 which is a list of all the arguments given to this function.
9336 If the function can't decide a coding system, it can return
9337 `undecided' so that the normal code-detection is performed.
9338
9339 If OPERATION is `insert-file-contents', the argument corresponding to
9340 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9341 file name to look up, and BUFFER is a buffer that contains the file's
9342 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9343 function to call for FILENAME, that function should examine the
9344 contents of BUFFER instead of reading the file.
9345
9346 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9347   (int nargs, Lisp_Object *args)
9348 {
9349   Lisp_Object operation, target_idx, target, val;
9350   register Lisp_Object chain;
9351
9352   if (nargs < 2)
9353     error ("Too few arguments");
9354   operation = args[0];
9355   if (!SYMBOLP (operation)
9356       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9357     error ("Invalid first argument");
9358   if (nargs < 1 + XINT (target_idx))
9359     error ("Too few arguments for operation: %s",
9360            SDATA (SYMBOL_NAME (operation)));
9361   target = args[XINT (target_idx) + 1];
9362   if (!(STRINGP (target)
9363         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9364             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9365         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9366     error ("Invalid %dth argument", XINT (target_idx) + 1);
9367   if (CONSP (target))
9368     target = XCAR (target);
9369
9370   chain = ((EQ (operation, Qinsert_file_contents)
9371             || EQ (operation, Qwrite_region))
9372            ? Vfile_coding_system_alist
9373            : (EQ (operation, Qopen_network_stream)
9374               ? Vnetwork_coding_system_alist
9375               : Vprocess_coding_system_alist));
9376   if (NILP (chain))
9377     return Qnil;
9378
9379   for (; CONSP (chain); chain = XCDR (chain))
9380     {
9381       Lisp_Object elt;
9382
9383       elt = XCAR (chain);
9384       if (CONSP (elt)
9385           && ((STRINGP (target)
9386                && STRINGP (XCAR (elt))
9387                && fast_string_match (XCAR (elt), target) >= 0)
9388               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9389         {
9390           val = XCDR (elt);
9391           /* Here, if VAL is both a valid coding system and a valid
9392              function symbol, we return VAL as a coding system.  */
9393           if (CONSP (val))
9394             return val;
9395           if (! SYMBOLP (val))
9396             return Qnil;
9397           if (! NILP (Fcoding_system_p (val)))
9398             return Fcons (val, val);
9399           if (! NILP (Ffboundp (val)))
9400             {
9401               /* We use call1 rather than safe_call1
9402                  so as to get bug reports about functions called here
9403                  which don't handle the current interface.  */
9404               val = call1 (val, Flist (nargs, args));
9405               if (CONSP (val))
9406                 return val;
9407               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9408                 return Fcons (val, val);
9409             }
9410           return Qnil;
9411         }
9412     }
9413   return Qnil;
9414 }
9415
9416 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9417        Sset_coding_system_priority, 0, MANY, 0,
9418        doc: /* Assign higher priority to the coding systems given as arguments.
9419 If multiple coding systems belong to the same category,
9420 all but the first one are ignored.
9421
9422 usage: (set-coding-system-priority &rest coding-systems)  */)
9423   (int nargs, Lisp_Object *args)
9424 {
9425   int i, j;
9426   int changed[coding_category_max];
9427   enum coding_category priorities[coding_category_max];
9428
9429   memset (changed, 0, sizeof changed);
9430
9431   for (i = j = 0; i < nargs; i++)
9432     {
9433       enum coding_category category;
9434       Lisp_Object spec, attrs;
9435
9436       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9437       attrs = AREF (spec, 0);
9438       category = XINT (CODING_ATTR_CATEGORY (attrs));
9439       if (changed[category])
9440         /* Ignore this coding system because a coding system of the
9441            same category already had a higher priority.  */
9442         continue;
9443       changed[category] = 1;
9444       priorities[j++] = category;
9445       if (coding_categories[category].id >= 0
9446           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9447         setup_coding_system (args[i], &coding_categories[category]);
9448       Fset (AREF (Vcoding_category_table, category), args[i]);
9449     }
9450
9451   /* Now we have decided top J priorities.  Reflect the order of the
9452      original priorities to the remaining priorities.  */
9453
9454   for (i = j, j = 0; i < coding_category_max; i++, j++)
9455     {
9456       while (j < coding_category_max
9457              && changed[coding_priorities[j]])
9458         j++;
9459       if (j == coding_category_max)
9460         abort ();
9461       priorities[i] = coding_priorities[j];
9462     }
9463
9464   memcpy (coding_priorities, priorities, sizeof priorities);
9465
9466   /* Update `coding-category-list'.  */
9467   Vcoding_category_list = Qnil;
9468   for (i = coding_category_max - 1; i >= 0; i--)
9469     Vcoding_category_list
9470       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9471                Vcoding_category_list);
9472
9473   return Qnil;
9474 }
9475
9476 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9477        Scoding_system_priority_list, 0, 1, 0,
9478        doc: /* Return a list of coding systems ordered by their priorities.
9479 The list contains a subset of coding systems; i.e. coding systems
9480 assigned to each coding category (see `coding-category-list').
9481
9482 HIGHESTP non-nil means just return the highest priority one.  */)
9483   (Lisp_Object highestp)
9484 {
9485   int i;
9486   Lisp_Object val;
9487
9488   for (i = 0, val = Qnil; i < coding_category_max; i++)
9489     {
9490       enum coding_category category = coding_priorities[i];
9491       int id = coding_categories[category].id;
9492       Lisp_Object attrs;
9493
9494       if (id < 0)
9495         continue;
9496       attrs = CODING_ID_ATTRS (id);
9497       if (! NILP (highestp))
9498         return CODING_ATTR_BASE_NAME (attrs);
9499       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9500     }
9501   return Fnreverse (val);
9502 }
9503
9504 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9505
9506 static Lisp_Object
9507 make_subsidiaries (Lisp_Object base)
9508 {
9509   Lisp_Object subsidiaries;
9510   int base_name_len = SBYTES (SYMBOL_NAME (base));
9511   char *buf = (char *) alloca (base_name_len + 6);
9512   int i;
9513
9514   memcpy (buf, SDATA (SYMBOL_NAME (base)), base_name_len);
9515   subsidiaries = Fmake_vector (make_number (3), Qnil);
9516   for (i = 0; i < 3; i++)
9517     {
9518       memcpy (buf + base_name_len, suffixes[i], strlen (suffixes[i]) + 1);
9519       ASET (subsidiaries, i, intern (buf));
9520     }
9521   return subsidiaries;
9522 }
9523
9524
9525 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9526        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9527        doc: /* For internal use only.
9528 usage: (define-coding-system-internal ...)  */)
9529   (int nargs, Lisp_Object *args)
9530 {
9531   Lisp_Object name;
9532   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9533   Lisp_Object attrs;            /* Vector of attributes.  */
9534   Lisp_Object eol_type;
9535   Lisp_Object aliases;
9536   Lisp_Object coding_type, charset_list, safe_charsets;
9537   enum coding_category category;
9538   Lisp_Object tail, val;
9539   int max_charset_id = 0;
9540   int i;
9541
9542   if (nargs < coding_arg_max)
9543     goto short_args;
9544
9545   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9546
9547   name = args[coding_arg_name];
9548   CHECK_SYMBOL (name);
9549   CODING_ATTR_BASE_NAME (attrs) = name;
9550
9551   val = args[coding_arg_mnemonic];
9552   if (! STRINGP (val))
9553     CHECK_CHARACTER (val);
9554   CODING_ATTR_MNEMONIC (attrs) = val;
9555
9556   coding_type = args[coding_arg_coding_type];
9557   CHECK_SYMBOL (coding_type);
9558   CODING_ATTR_TYPE (attrs) = coding_type;
9559
9560   charset_list = args[coding_arg_charset_list];
9561   if (SYMBOLP (charset_list))
9562     {
9563       if (EQ (charset_list, Qiso_2022))
9564         {
9565           if (! EQ (coding_type, Qiso_2022))
9566             error ("Invalid charset-list");
9567           charset_list = Viso_2022_charset_list;
9568         }
9569       else if (EQ (charset_list, Qemacs_mule))
9570         {
9571           if (! EQ (coding_type, Qemacs_mule))
9572             error ("Invalid charset-list");
9573           charset_list = Vemacs_mule_charset_list;
9574         }
9575       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9576         if (max_charset_id < XFASTINT (XCAR (tail)))
9577           max_charset_id = XFASTINT (XCAR (tail));
9578     }
9579   else
9580     {
9581       charset_list = Fcopy_sequence (charset_list);
9582       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9583         {
9584           struct charset *charset;
9585
9586           val = XCAR (tail);
9587           CHECK_CHARSET_GET_CHARSET (val, charset);
9588           if (EQ (coding_type, Qiso_2022)
9589               ? CHARSET_ISO_FINAL (charset) < 0
9590               : EQ (coding_type, Qemacs_mule)
9591               ? CHARSET_EMACS_MULE_ID (charset) < 0
9592               : 0)
9593             error ("Can't handle charset `%s'",
9594                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9595
9596           XSETCAR (tail, make_number (charset->id));
9597           if (max_charset_id < charset->id)
9598             max_charset_id = charset->id;
9599         }
9600     }
9601   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9602
9603   safe_charsets = make_uninit_string (max_charset_id + 1);
9604   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9605   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9606     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9607   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9608
9609   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9610
9611   val = args[coding_arg_decode_translation_table];
9612   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9613     CHECK_SYMBOL (val);
9614   CODING_ATTR_DECODE_TBL (attrs) = val;
9615
9616   val = args[coding_arg_encode_translation_table];
9617   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9618     CHECK_SYMBOL (val);
9619   CODING_ATTR_ENCODE_TBL (attrs) = val;
9620
9621   val = args[coding_arg_post_read_conversion];
9622   CHECK_SYMBOL (val);
9623   CODING_ATTR_POST_READ (attrs) = val;
9624
9625   val = args[coding_arg_pre_write_conversion];
9626   CHECK_SYMBOL (val);
9627   CODING_ATTR_PRE_WRITE (attrs) = val;
9628
9629   val = args[coding_arg_default_char];
9630   if (NILP (val))
9631     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9632   else
9633     {
9634       CHECK_CHARACTER (val);
9635       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9636     }
9637
9638   val = args[coding_arg_for_unibyte];
9639   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9640
9641   val = args[coding_arg_plist];
9642   CHECK_LIST (val);
9643   CODING_ATTR_PLIST (attrs) = val;
9644
9645   if (EQ (coding_type, Qcharset))
9646     {
9647       /* Generate a lisp vector of 256 elements.  Each element is nil,
9648          integer, or a list of charset IDs.
9649
9650          If Nth element is nil, the byte code N is invalid in this
9651          coding system.
9652
9653          If Nth element is a number NUM, N is the first byte of a
9654          charset whose ID is NUM.
9655
9656          If Nth element is a list of charset IDs, N is the first byte
9657          of one of them.  The list is sorted by dimensions of the
9658          charsets.  A charset of smaller dimension comes first. */
9659       val = Fmake_vector (make_number (256), Qnil);
9660
9661       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9662         {
9663           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9664           int dim = CHARSET_DIMENSION (charset);
9665           int idx = (dim - 1) * 4;
9666
9667           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9668             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9669
9670           for (i = charset->code_space[idx];
9671                i <= charset->code_space[idx + 1]; i++)
9672             {
9673               Lisp_Object tmp, tmp2;
9674               int dim2;
9675
9676               tmp = AREF (val, i);
9677               if (NILP (tmp))
9678                 tmp = XCAR (tail);
9679               else if (NUMBERP (tmp))
9680                 {
9681                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9682                   if (dim < dim2)
9683                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9684                   else
9685                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9686                 }
9687               else
9688                 {
9689                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9690                     {
9691                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9692                       if (dim < dim2)
9693                         break;
9694                     }
9695                   if (NILP (tmp2))
9696                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9697                   else
9698                     {
9699                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9700                       XSETCAR (tmp2, XCAR (tail));
9701                     }
9702                 }
9703               ASET (val, i, tmp);
9704             }
9705         }
9706       ASET (attrs, coding_attr_charset_valids, val);
9707       category = coding_category_charset;
9708     }
9709   else if (EQ (coding_type, Qccl))
9710     {
9711       Lisp_Object valids;
9712
9713       if (nargs < coding_arg_ccl_max)
9714         goto short_args;
9715
9716       val = args[coding_arg_ccl_decoder];
9717       CHECK_CCL_PROGRAM (val);
9718       if (VECTORP (val))
9719         val = Fcopy_sequence (val);
9720       ASET (attrs, coding_attr_ccl_decoder, val);
9721
9722       val = args[coding_arg_ccl_encoder];
9723       CHECK_CCL_PROGRAM (val);
9724       if (VECTORP (val))
9725         val = Fcopy_sequence (val);
9726       ASET (attrs, coding_attr_ccl_encoder, val);
9727
9728       val = args[coding_arg_ccl_valids];
9729       valids = Fmake_string (make_number (256), make_number (0));
9730       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9731         {
9732           int from, to;
9733
9734           val = Fcar (tail);
9735           if (INTEGERP (val))
9736             {
9737               from = to = XINT (val);
9738               if (from < 0 || from > 255)
9739                 args_out_of_range_3 (val, make_number (0), make_number (255));
9740             }
9741           else
9742             {
9743               CHECK_CONS (val);
9744               CHECK_NATNUM_CAR (val);
9745               CHECK_NATNUM_CDR (val);
9746               from = XINT (XCAR (val));
9747               if (from > 255)
9748                 args_out_of_range_3 (XCAR (val),
9749                                      make_number (0), make_number (255));
9750               to = XINT (XCDR (val));
9751               if (to < from || to > 255)
9752                 args_out_of_range_3 (XCDR (val),
9753                                      XCAR (val), make_number (255));
9754             }
9755           for (i = from; i <= to; i++)
9756             SSET (valids, i, 1);
9757         }
9758       ASET (attrs, coding_attr_ccl_valids, valids);
9759
9760       category = coding_category_ccl;
9761     }
9762   else if (EQ (coding_type, Qutf_16))
9763     {
9764       Lisp_Object bom, endian;
9765
9766       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9767
9768       if (nargs < coding_arg_utf16_max)
9769         goto short_args;
9770
9771       bom = args[coding_arg_utf16_bom];
9772       if (! NILP (bom) && ! EQ (bom, Qt))
9773         {
9774           CHECK_CONS (bom);
9775           val = XCAR (bom);
9776           CHECK_CODING_SYSTEM (val);
9777           val = XCDR (bom);
9778           CHECK_CODING_SYSTEM (val);
9779         }
9780       ASET (attrs, coding_attr_utf_bom, bom);
9781
9782       endian = args[coding_arg_utf16_endian];
9783       CHECK_SYMBOL (endian);
9784       if (NILP (endian))
9785         endian = Qbig;
9786       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9787         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9788       ASET (attrs, coding_attr_utf_16_endian, endian);
9789
9790       category = (CONSP (bom)
9791                   ? coding_category_utf_16_auto
9792                   : NILP (bom)
9793                   ? (EQ (endian, Qbig)
9794                      ? coding_category_utf_16_be_nosig
9795                      : coding_category_utf_16_le_nosig)
9796                   : (EQ (endian, Qbig)
9797                      ? coding_category_utf_16_be
9798                      : coding_category_utf_16_le));
9799     }
9800   else if (EQ (coding_type, Qiso_2022))
9801     {
9802       Lisp_Object initial, reg_usage, request, flags;
9803       int i;
9804
9805       if (nargs < coding_arg_iso2022_max)
9806         goto short_args;
9807
9808       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9809       CHECK_VECTOR (initial);
9810       for (i = 0; i < 4; i++)
9811         {
9812           val = Faref (initial, make_number (i));
9813           if (! NILP (val))
9814             {
9815               struct charset *charset;
9816
9817               CHECK_CHARSET_GET_CHARSET (val, charset);
9818               ASET (initial, i, make_number (CHARSET_ID (charset)));
9819               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9820                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9821             }
9822           else
9823             ASET (initial, i, make_number (-1));
9824         }
9825
9826       reg_usage = args[coding_arg_iso2022_reg_usage];
9827       CHECK_CONS (reg_usage);
9828       CHECK_NUMBER_CAR (reg_usage);
9829       CHECK_NUMBER_CDR (reg_usage);
9830
9831       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9832       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9833         {
9834           int id;
9835           Lisp_Object tmp;
9836
9837           val = Fcar (tail);
9838           CHECK_CONS (val);
9839           tmp = XCAR (val);
9840           CHECK_CHARSET_GET_ID (tmp, id);
9841           CHECK_NATNUM_CDR (val);
9842           if (XINT (XCDR (val)) >= 4)
9843             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9844           XSETCAR (val, make_number (id));
9845         }
9846
9847       flags = args[coding_arg_iso2022_flags];
9848       CHECK_NATNUM (flags);
9849       i = XINT (flags);
9850       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9851         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9852
9853       ASET (attrs, coding_attr_iso_initial, initial);
9854       ASET (attrs, coding_attr_iso_usage, reg_usage);
9855       ASET (attrs, coding_attr_iso_request, request);
9856       ASET (attrs, coding_attr_iso_flags, flags);
9857       setup_iso_safe_charsets (attrs);
9858
9859       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9860         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9861                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9862                     ? coding_category_iso_7_else
9863                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9864                     ? coding_category_iso_7
9865                     : coding_category_iso_7_tight);
9866       else
9867         {
9868           int id = XINT (AREF (initial, 1));
9869
9870           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9871                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9872                        || id < 0)
9873                       ? coding_category_iso_8_else
9874                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9875                       ? coding_category_iso_8_1
9876                       : coding_category_iso_8_2);
9877         }
9878       if (category != coding_category_iso_8_1
9879           && category != coding_category_iso_8_2)
9880         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9881     }
9882   else if (EQ (coding_type, Qemacs_mule))
9883     {
9884       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
9885         ASET (attrs, coding_attr_emacs_mule_full, Qt);
9886       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9887       category = coding_category_emacs_mule;
9888     }
9889   else if (EQ (coding_type, Qshift_jis))
9890     {
9891
9892       struct charset *charset;
9893
9894       if (XINT (Flength (charset_list)) != 3
9895           && XINT (Flength (charset_list)) != 4)
9896         error ("There should be three or four charsets");
9897
9898       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9899       if (CHARSET_DIMENSION (charset) != 1)
9900         error ("Dimension of charset %s is not one",
9901                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9902       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9903         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9904
9905       charset_list = XCDR (charset_list);
9906       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9907       if (CHARSET_DIMENSION (charset) != 1)
9908         error ("Dimension of charset %s is not one",
9909                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9910
9911       charset_list = XCDR (charset_list);
9912       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9913       if (CHARSET_DIMENSION (charset) != 2)
9914         error ("Dimension of charset %s is not two",
9915                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9916
9917       charset_list = XCDR (charset_list);
9918       if (! NILP (charset_list))
9919         {
9920           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9921           if (CHARSET_DIMENSION (charset) != 2)
9922             error ("Dimension of charset %s is not two",
9923                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9924         }
9925
9926       category = coding_category_sjis;
9927       Vsjis_coding_system = name;
9928     }
9929   else if (EQ (coding_type, Qbig5))
9930     {
9931       struct charset *charset;
9932
9933       if (XINT (Flength (charset_list)) != 2)
9934         error ("There should be just two charsets");
9935
9936       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9937       if (CHARSET_DIMENSION (charset) != 1)
9938         error ("Dimension of charset %s is not one",
9939                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9940       if (CHARSET_ASCII_COMPATIBLE_P (charset))
9941         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9942
9943       charset_list = XCDR (charset_list);
9944       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
9945       if (CHARSET_DIMENSION (charset) != 2)
9946         error ("Dimension of charset %s is not two",
9947                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9948
9949       category = coding_category_big5;
9950       Vbig5_coding_system = name;
9951     }
9952   else if (EQ (coding_type, Qraw_text))
9953     {
9954       category = coding_category_raw_text;
9955       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9956     }
9957   else if (EQ (coding_type, Qutf_8))
9958     {
9959       Lisp_Object bom;
9960
9961       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9962
9963       if (nargs < coding_arg_utf8_max)
9964         goto short_args;
9965
9966       bom = args[coding_arg_utf8_bom];
9967       if (! NILP (bom) && ! EQ (bom, Qt))
9968         {
9969           CHECK_CONS (bom);
9970           val = XCAR (bom);
9971           CHECK_CODING_SYSTEM (val);
9972           val = XCDR (bom);
9973           CHECK_CODING_SYSTEM (val);
9974         }
9975       ASET (attrs, coding_attr_utf_bom, bom);
9976
9977       category = (CONSP (bom) ? coding_category_utf_8_auto
9978                   : NILP (bom) ? coding_category_utf_8_nosig
9979                   : coding_category_utf_8_sig);
9980     }
9981   else if (EQ (coding_type, Qundecided))
9982     category = coding_category_undecided;
9983   else
9984     error ("Invalid coding system type: %s",
9985            SDATA (SYMBOL_NAME (coding_type)));
9986
9987   CODING_ATTR_CATEGORY (attrs) = make_number (category);
9988   CODING_ATTR_PLIST (attrs)
9989     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
9990                                 CODING_ATTR_PLIST (attrs)));
9991   CODING_ATTR_PLIST (attrs)
9992     = Fcons (QCascii_compatible_p,
9993              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
9994                     CODING_ATTR_PLIST (attrs)));
9995
9996   eol_type = args[coding_arg_eol_type];
9997   if (! NILP (eol_type)
9998       && ! EQ (eol_type, Qunix)
9999       && ! EQ (eol_type, Qdos)
10000       && ! EQ (eol_type, Qmac))
10001     error ("Invalid eol-type");
10002
10003   aliases = Fcons (name, Qnil);
10004
10005   if (NILP (eol_type))
10006     {
10007       eol_type = make_subsidiaries (name);
10008       for (i = 0; i < 3; i++)
10009         {
10010           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10011
10012           this_name = AREF (eol_type, i);
10013           this_aliases = Fcons (this_name, Qnil);
10014           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10015           this_spec = Fmake_vector (make_number (3), attrs);
10016           ASET (this_spec, 1, this_aliases);
10017           ASET (this_spec, 2, this_eol_type);
10018           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10019           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10020           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10021           if (NILP (val))
10022             Vcoding_system_alist
10023               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10024                        Vcoding_system_alist);
10025         }
10026     }
10027
10028   spec_vec = Fmake_vector (make_number (3), attrs);
10029   ASET (spec_vec, 1, aliases);
10030   ASET (spec_vec, 2, eol_type);
10031
10032   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10033   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10034   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10035   if (NILP (val))
10036     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10037                                   Vcoding_system_alist);
10038
10039   {
10040     int id = coding_categories[category].id;
10041
10042     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10043       setup_coding_system (name, &coding_categories[category]);
10044   }
10045
10046   return Qnil;
10047
10048  short_args:
10049   return Fsignal (Qwrong_number_of_arguments,
10050                   Fcons (intern ("define-coding-system-internal"),
10051                          make_number (nargs)));
10052 }
10053
10054
10055 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10056        3, 3, 0,
10057        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10058   (Lisp_Object coding_system, Lisp_Object prop, Lisp_Object val)
10059 {
10060   Lisp_Object spec, attrs;
10061
10062   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10063   attrs = AREF (spec, 0);
10064   if (EQ (prop, QCmnemonic))
10065     {
10066       if (! STRINGP (val))
10067         CHECK_CHARACTER (val);
10068       CODING_ATTR_MNEMONIC (attrs) = val;
10069     }
10070   else if (EQ (prop, QCdefault_char))
10071     {
10072       if (NILP (val))
10073         val = make_number (' ');
10074       else
10075         CHECK_CHARACTER (val);
10076       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10077     }
10078   else if (EQ (prop, QCdecode_translation_table))
10079     {
10080       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10081         CHECK_SYMBOL (val);
10082       CODING_ATTR_DECODE_TBL (attrs) = val;
10083     }
10084   else if (EQ (prop, QCencode_translation_table))
10085     {
10086       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10087         CHECK_SYMBOL (val);
10088       CODING_ATTR_ENCODE_TBL (attrs) = val;
10089     }
10090   else if (EQ (prop, QCpost_read_conversion))
10091     {
10092       CHECK_SYMBOL (val);
10093       CODING_ATTR_POST_READ (attrs) = val;
10094     }
10095   else if (EQ (prop, QCpre_write_conversion))
10096     {
10097       CHECK_SYMBOL (val);
10098       CODING_ATTR_PRE_WRITE (attrs) = val;
10099     }
10100   else if (EQ (prop, QCascii_compatible_p))
10101     {
10102       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10103     }
10104
10105   CODING_ATTR_PLIST (attrs)
10106     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10107   return val;
10108 }
10109
10110
10111 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10112        Sdefine_coding_system_alias, 2, 2, 0,
10113        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10114   (Lisp_Object alias, Lisp_Object coding_system)
10115 {
10116   Lisp_Object spec, aliases, eol_type, val;
10117
10118   CHECK_SYMBOL (alias);
10119   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10120   aliases = AREF (spec, 1);
10121   /* ALIASES should be a list of length more than zero, and the first
10122      element is a base coding system.  Append ALIAS at the tail of the
10123      list.  */
10124   while (!NILP (XCDR (aliases)))
10125     aliases = XCDR (aliases);
10126   XSETCDR (aliases, Fcons (alias, Qnil));
10127
10128   eol_type = AREF (spec, 2);
10129   if (VECTORP (eol_type))
10130     {
10131       Lisp_Object subsidiaries;
10132       int i;
10133
10134       subsidiaries = make_subsidiaries (alias);
10135       for (i = 0; i < 3; i++)
10136         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10137                                      AREF (eol_type, i));
10138     }
10139
10140   Fputhash (alias, spec, Vcoding_system_hash_table);
10141   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10142   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10143   if (NILP (val))
10144     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10145                                   Vcoding_system_alist);
10146
10147   return Qnil;
10148 }
10149
10150 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10151        1, 1, 0,
10152        doc: /* Return the base of CODING-SYSTEM.
10153 Any alias or subsidiary coding system is not a base coding system.  */)
10154   (Lisp_Object coding_system)
10155 {
10156   Lisp_Object spec, attrs;
10157
10158   if (NILP (coding_system))
10159     return (Qno_conversion);
10160   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10161   attrs = AREF (spec, 0);
10162   return CODING_ATTR_BASE_NAME (attrs);
10163 }
10164
10165 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10166        1, 1, 0,
10167        doc: "Return the property list of CODING-SYSTEM.")
10168   (Lisp_Object coding_system)
10169 {
10170   Lisp_Object spec, attrs;
10171
10172   if (NILP (coding_system))
10173     coding_system = Qno_conversion;
10174   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10175   attrs = AREF (spec, 0);
10176   return CODING_ATTR_PLIST (attrs);
10177 }
10178
10179
10180 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10181        1, 1, 0,
10182        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10183   (Lisp_Object coding_system)
10184 {
10185   Lisp_Object spec;
10186
10187   if (NILP (coding_system))
10188     coding_system = Qno_conversion;
10189   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10190   return AREF (spec, 1);
10191 }
10192
10193 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10194        Scoding_system_eol_type, 1, 1, 0,
10195        doc: /* Return eol-type of CODING-SYSTEM.
10196 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10197
10198 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10199 and CR respectively.
10200
10201 A vector value indicates that a format of end-of-line should be
10202 detected automatically.  Nth element of the vector is the subsidiary
10203 coding system whose eol-type is N.  */)
10204   (Lisp_Object coding_system)
10205 {
10206   Lisp_Object spec, eol_type;
10207   int n;
10208
10209   if (NILP (coding_system))
10210     coding_system = Qno_conversion;
10211   if (! CODING_SYSTEM_P (coding_system))
10212     return Qnil;
10213   spec = CODING_SYSTEM_SPEC (coding_system);
10214   eol_type = AREF (spec, 2);
10215   if (VECTORP (eol_type))
10216     return Fcopy_sequence (eol_type);
10217   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10218   return make_number (n);
10219 }
10220
10221 #endif /* emacs */
10222
10223 \f
10224 /*** 9. Post-amble ***/
10225
10226 void
10227 init_coding_once (void)
10228 {
10229   int i;
10230
10231   for (i = 0; i < coding_category_max; i++)
10232     {
10233       coding_categories[i].id = -1;
10234       coding_priorities[i] = i;
10235     }
10236
10237   /* ISO2022 specific initialize routine.  */
10238   for (i = 0; i < 0x20; i++)
10239     iso_code_class[i] = ISO_control_0;
10240   for (i = 0x21; i < 0x7F; i++)
10241     iso_code_class[i] = ISO_graphic_plane_0;
10242   for (i = 0x80; i < 0xA0; i++)
10243     iso_code_class[i] = ISO_control_1;
10244   for (i = 0xA1; i < 0xFF; i++)
10245     iso_code_class[i] = ISO_graphic_plane_1;
10246   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10247   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10248   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10249   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10250   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10251   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10252   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10253   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10254   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10255
10256   for (i = 0; i < 256; i++)
10257     {
10258       emacs_mule_bytes[i] = 1;
10259     }
10260   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10261   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10262   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10263   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10264 }
10265
10266 #ifdef emacs
10267
10268 void
10269 syms_of_coding (void)
10270 {
10271   staticpro (&Vcoding_system_hash_table);
10272   {
10273     Lisp_Object args[2];
10274     args[0] = QCtest;
10275     args[1] = Qeq;
10276     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10277   }
10278
10279   staticpro (&Vsjis_coding_system);
10280   Vsjis_coding_system = Qnil;
10281
10282   staticpro (&Vbig5_coding_system);
10283   Vbig5_coding_system = Qnil;
10284
10285   staticpro (&Vcode_conversion_reused_workbuf);
10286   Vcode_conversion_reused_workbuf = Qnil;
10287
10288   staticpro (&Vcode_conversion_workbuf_name);
10289   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10290
10291   reused_workbuf_in_use = 0;
10292
10293   DEFSYM (Qcharset, "charset");
10294   DEFSYM (Qtarget_idx, "target-idx");
10295   DEFSYM (Qcoding_system_history, "coding-system-history");
10296   Fset (Qcoding_system_history, Qnil);
10297
10298   /* Target FILENAME is the first argument.  */
10299   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10300   /* Target FILENAME is the third argument.  */
10301   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10302
10303   DEFSYM (Qcall_process, "call-process");
10304   /* Target PROGRAM is the first argument.  */
10305   Fput (Qcall_process, Qtarget_idx, make_number (0));
10306
10307   DEFSYM (Qcall_process_region, "call-process-region");
10308   /* Target PROGRAM is the third argument.  */
10309   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10310
10311   DEFSYM (Qstart_process, "start-process");
10312   /* Target PROGRAM is the third argument.  */
10313   Fput (Qstart_process, Qtarget_idx, make_number (2));
10314
10315   DEFSYM (Qopen_network_stream, "open-network-stream");
10316   /* Target SERVICE is the fourth argument.  */
10317   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10318
10319   DEFSYM (Qcoding_system, "coding-system");
10320   DEFSYM (Qcoding_aliases, "coding-aliases");
10321
10322   DEFSYM (Qeol_type, "eol-type");
10323   DEFSYM (Qunix, "unix");
10324   DEFSYM (Qdos, "dos");
10325
10326   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10327   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10328   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10329   DEFSYM (Qdefault_char, "default-char");
10330   DEFSYM (Qundecided, "undecided");
10331   DEFSYM (Qno_conversion, "no-conversion");
10332   DEFSYM (Qraw_text, "raw-text");
10333
10334   DEFSYM (Qiso_2022, "iso-2022");
10335
10336   DEFSYM (Qutf_8, "utf-8");
10337   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10338
10339   DEFSYM (Qutf_16, "utf-16");
10340   DEFSYM (Qbig, "big");
10341   DEFSYM (Qlittle, "little");
10342
10343   DEFSYM (Qshift_jis, "shift-jis");
10344   DEFSYM (Qbig5, "big5");
10345
10346   DEFSYM (Qcoding_system_p, "coding-system-p");
10347
10348   DEFSYM (Qcoding_system_error, "coding-system-error");
10349   Fput (Qcoding_system_error, Qerror_conditions,
10350         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10351   Fput (Qcoding_system_error, Qerror_message,
10352         make_pure_c_string ("Invalid coding system"));
10353
10354   /* Intern this now in case it isn't already done.
10355      Setting this variable twice is harmless.
10356      But don't staticpro it here--that is done in alloc.c.  */
10357   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10358
10359   DEFSYM (Qtranslation_table, "translation-table");
10360   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10361   DEFSYM (Qtranslation_table_id, "translation-table-id");
10362   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10363   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10364
10365   DEFSYM (Qvalid_codes, "valid-codes");
10366
10367   DEFSYM (Qemacs_mule, "emacs-mule");
10368
10369   DEFSYM (QCcategory, ":category");
10370   DEFSYM (QCmnemonic, ":mnemonic");
10371   DEFSYM (QCdefault_char, ":default-char");
10372   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10373   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10374   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10375   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10376   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10377
10378   Vcoding_category_table
10379     = Fmake_vector (make_number (coding_category_max), Qnil);
10380   staticpro (&Vcoding_category_table);
10381   /* Followings are target of code detection.  */
10382   ASET (Vcoding_category_table, coding_category_iso_7,
10383         intern_c_string ("coding-category-iso-7"));
10384   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10385         intern_c_string ("coding-category-iso-7-tight"));
10386   ASET (Vcoding_category_table, coding_category_iso_8_1,
10387         intern_c_string ("coding-category-iso-8-1"));
10388   ASET (Vcoding_category_table, coding_category_iso_8_2,
10389         intern_c_string ("coding-category-iso-8-2"));
10390   ASET (Vcoding_category_table, coding_category_iso_7_else,
10391         intern_c_string ("coding-category-iso-7-else"));
10392   ASET (Vcoding_category_table, coding_category_iso_8_else,
10393         intern_c_string ("coding-category-iso-8-else"));
10394   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10395         intern_c_string ("coding-category-utf-8-auto"));
10396   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10397         intern_c_string ("coding-category-utf-8"));
10398   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10399         intern_c_string ("coding-category-utf-8-sig"));
10400   ASET (Vcoding_category_table, coding_category_utf_16_be,
10401         intern_c_string ("coding-category-utf-16-be"));
10402   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10403         intern_c_string ("coding-category-utf-16-auto"));
10404   ASET (Vcoding_category_table, coding_category_utf_16_le,
10405         intern_c_string ("coding-category-utf-16-le"));
10406   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10407         intern_c_string ("coding-category-utf-16-be-nosig"));
10408   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10409         intern_c_string ("coding-category-utf-16-le-nosig"));
10410   ASET (Vcoding_category_table, coding_category_charset,
10411         intern_c_string ("coding-category-charset"));
10412   ASET (Vcoding_category_table, coding_category_sjis,
10413         intern_c_string ("coding-category-sjis"));
10414   ASET (Vcoding_category_table, coding_category_big5,
10415         intern_c_string ("coding-category-big5"));
10416   ASET (Vcoding_category_table, coding_category_ccl,
10417         intern_c_string ("coding-category-ccl"));
10418   ASET (Vcoding_category_table, coding_category_emacs_mule,
10419         intern_c_string ("coding-category-emacs-mule"));
10420   /* Followings are NOT target of code detection.  */
10421   ASET (Vcoding_category_table, coding_category_raw_text,
10422         intern_c_string ("coding-category-raw-text"));
10423   ASET (Vcoding_category_table, coding_category_undecided,
10424         intern_c_string ("coding-category-undecided"));
10425
10426   DEFSYM (Qinsufficient_source, "insufficient-source");
10427   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10428   DEFSYM (Qinvalid_source, "invalid-source");
10429   DEFSYM (Qinterrupted, "interrupted");
10430   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10431   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10432
10433   defsubr (&Scoding_system_p);
10434   defsubr (&Sread_coding_system);
10435   defsubr (&Sread_non_nil_coding_system);
10436   defsubr (&Scheck_coding_system);
10437   defsubr (&Sdetect_coding_region);
10438   defsubr (&Sdetect_coding_string);
10439   defsubr (&Sfind_coding_systems_region_internal);
10440   defsubr (&Sunencodable_char_position);
10441   defsubr (&Scheck_coding_systems_region);
10442   defsubr (&Sdecode_coding_region);
10443   defsubr (&Sencode_coding_region);
10444   defsubr (&Sdecode_coding_string);
10445   defsubr (&Sencode_coding_string);
10446   defsubr (&Sdecode_sjis_char);
10447   defsubr (&Sencode_sjis_char);
10448   defsubr (&Sdecode_big5_char);
10449   defsubr (&Sencode_big5_char);
10450   defsubr (&Sset_terminal_coding_system_internal);
10451   defsubr (&Sset_safe_terminal_coding_system_internal);
10452   defsubr (&Sterminal_coding_system);
10453   defsubr (&Sset_keyboard_coding_system_internal);
10454   defsubr (&Skeyboard_coding_system);
10455   defsubr (&Sfind_operation_coding_system);
10456   defsubr (&Sset_coding_system_priority);
10457   defsubr (&Sdefine_coding_system_internal);
10458   defsubr (&Sdefine_coding_system_alias);
10459   defsubr (&Scoding_system_put);
10460   defsubr (&Scoding_system_base);
10461   defsubr (&Scoding_system_plist);
10462   defsubr (&Scoding_system_aliases);
10463   defsubr (&Scoding_system_eol_type);
10464   defsubr (&Scoding_system_priority_list);
10465
10466   DEFVAR_LISP ("coding-system-list", Vcoding_system_list,
10467                doc: /* List of coding systems.
10468
10469 Do not alter the value of this variable manually.  This variable should be
10470 updated by the functions `define-coding-system' and
10471 `define-coding-system-alias'.  */);
10472   Vcoding_system_list = Qnil;
10473
10474   DEFVAR_LISP ("coding-system-alist", Vcoding_system_alist,
10475                doc: /* Alist of coding system names.
10476 Each element is one element list of coding system name.
10477 This variable is given to `completing-read' as COLLECTION argument.
10478
10479 Do not alter the value of this variable manually.  This variable should be
10480 updated by the functions `make-coding-system' and
10481 `define-coding-system-alias'.  */);
10482   Vcoding_system_alist = Qnil;
10483
10484   DEFVAR_LISP ("coding-category-list", Vcoding_category_list,
10485                doc: /* List of coding-categories (symbols) ordered by priority.
10486
10487 On detecting a coding system, Emacs tries code detection algorithms
10488 associated with each coding-category one by one in this order.  When
10489 one algorithm agrees with a byte sequence of source text, the coding
10490 system bound to the corresponding coding-category is selected.
10491
10492 Don't modify this variable directly, but use `set-coding-system-priority'.  */);
10493   {
10494     int i;
10495
10496     Vcoding_category_list = Qnil;
10497     for (i = coding_category_max - 1; i >= 0; i--)
10498       Vcoding_category_list
10499         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10500                  Vcoding_category_list);
10501   }
10502
10503   DEFVAR_LISP ("coding-system-for-read", Vcoding_system_for_read,
10504                doc: /* Specify the coding system for read operations.
10505 It is useful to bind this variable with `let', but do not set it globally.
10506 If the value is a coding system, it is used for decoding on read operation.
10507 If not, an appropriate element is used from one of the coding system alists.
10508 There are three such tables: `file-coding-system-alist',
10509 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10510   Vcoding_system_for_read = Qnil;
10511
10512   DEFVAR_LISP ("coding-system-for-write", Vcoding_system_for_write,
10513                doc: /* Specify the coding system for write operations.
10514 Programs bind this variable with `let', but you should not set it globally.
10515 If the value is a coding system, it is used for encoding of output,
10516 when writing it to a file and when sending it to a file or subprocess.
10517
10518 If this does not specify a coding system, an appropriate element
10519 is used from one of the coding system alists.
10520 There are three such tables: `file-coding-system-alist',
10521 `process-coding-system-alist', and `network-coding-system-alist'.
10522 For output to files, if the above procedure does not specify a coding system,
10523 the value of `buffer-file-coding-system' is used.  */);
10524   Vcoding_system_for_write = Qnil;
10525
10526   DEFVAR_LISP ("last-coding-system-used", Vlast_coding_system_used,
10527                doc: /*
10528 Coding system used in the latest file or process I/O.  */);
10529   Vlast_coding_system_used = Qnil;
10530
10531   DEFVAR_LISP ("last-code-conversion-error", Vlast_code_conversion_error,
10532                doc: /*
10533 Error status of the last code conversion.
10534
10535 When an error was detected in the last code conversion, this variable
10536 is set to one of the following symbols.
10537   `insufficient-source'
10538   `inconsistent-eol'
10539   `invalid-source'
10540   `interrupted'
10541   `insufficient-memory'
10542 When no error was detected, the value doesn't change.  So, to check
10543 the error status of a code conversion by this variable, you must
10544 explicitly set this variable to nil before performing code
10545 conversion.  */);
10546   Vlast_code_conversion_error = Qnil;
10547
10548   DEFVAR_BOOL ("inhibit-eol-conversion", inhibit_eol_conversion,
10549                doc: /*
10550 *Non-nil means always inhibit code conversion of end-of-line format.
10551 See info node `Coding Systems' and info node `Text and Binary' concerning
10552 such conversion.  */);
10553   inhibit_eol_conversion = 0;
10554
10555   DEFVAR_BOOL ("inherit-process-coding-system", inherit_process_coding_system,
10556                doc: /*
10557 Non-nil means process buffer inherits coding system of process output.
10558 Bind it to t if the process output is to be treated as if it were a file
10559 read from some filesystem.  */);
10560   inherit_process_coding_system = 0;
10561
10562   DEFVAR_LISP ("file-coding-system-alist", Vfile_coding_system_alist,
10563                doc: /*
10564 Alist to decide a coding system to use for a file I/O operation.
10565 The format is ((PATTERN . VAL) ...),
10566 where PATTERN is a regular expression matching a file name,
10567 VAL is a coding system, a cons of coding systems, or a function symbol.
10568 If VAL is a coding system, it is used for both decoding and encoding
10569 the file contents.
10570 If VAL is a cons of coding systems, the car part is used for decoding,
10571 and the cdr part is used for encoding.
10572 If VAL is a function symbol, the function must return a coding system
10573 or a cons of coding systems which are used as above.  The function is
10574 called with an argument that is a list of the arguments with which
10575 `find-operation-coding-system' was called.  If the function can't decide
10576 a coding system, it can return `undecided' so that the normal
10577 code-detection is performed.
10578
10579 See also the function `find-operation-coding-system'
10580 and the variable `auto-coding-alist'.  */);
10581   Vfile_coding_system_alist = Qnil;
10582
10583   DEFVAR_LISP ("process-coding-system-alist", Vprocess_coding_system_alist,
10584                doc: /*
10585 Alist to decide a coding system to use for a process I/O operation.
10586 The format is ((PATTERN . VAL) ...),
10587 where PATTERN is a regular expression matching a program name,
10588 VAL is a coding system, a cons of coding systems, or a function symbol.
10589 If VAL is a coding system, it is used for both decoding what received
10590 from the program and encoding what sent to the program.
10591 If VAL is a cons of coding systems, the car part is used for decoding,
10592 and the cdr part is used for encoding.
10593 If VAL is a function symbol, the function must return a coding system
10594 or a cons of coding systems which are used as above.
10595
10596 See also the function `find-operation-coding-system'.  */);
10597   Vprocess_coding_system_alist = Qnil;
10598
10599   DEFVAR_LISP ("network-coding-system-alist", Vnetwork_coding_system_alist,
10600                doc: /*
10601 Alist to decide a coding system to use for a network I/O operation.
10602 The format is ((PATTERN . VAL) ...),
10603 where PATTERN is a regular expression matching a network service name
10604 or is a port number to connect to,
10605 VAL is a coding system, a cons of coding systems, or a function symbol.
10606 If VAL is a coding system, it is used for both decoding what received
10607 from the network stream and encoding what sent to the network stream.
10608 If VAL is a cons of coding systems, the car part is used for decoding,
10609 and the cdr part is used for encoding.
10610 If VAL is a function symbol, the function must return a coding system
10611 or a cons of coding systems which are used as above.
10612
10613 See also the function `find-operation-coding-system'.  */);
10614   Vnetwork_coding_system_alist = Qnil;
10615
10616   DEFVAR_LISP ("locale-coding-system", Vlocale_coding_system,
10617                doc: /* Coding system to use with system messages.
10618 Also used for decoding keyboard input on X Window system.  */);
10619   Vlocale_coding_system = Qnil;
10620
10621   /* The eol mnemonics are reset in startup.el system-dependently.  */
10622   DEFVAR_LISP ("eol-mnemonic-unix", eol_mnemonic_unix,
10623                doc: /*
10624 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10625   eol_mnemonic_unix = make_pure_c_string (":");
10626
10627   DEFVAR_LISP ("eol-mnemonic-dos", eol_mnemonic_dos,
10628                doc: /*
10629 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10630   eol_mnemonic_dos = make_pure_c_string ("\\");
10631
10632   DEFVAR_LISP ("eol-mnemonic-mac", eol_mnemonic_mac,
10633                doc: /*
10634 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10635   eol_mnemonic_mac = make_pure_c_string ("/");
10636
10637   DEFVAR_LISP ("eol-mnemonic-undecided", eol_mnemonic_undecided,
10638                doc: /*
10639 *String displayed in mode line when end-of-line format is not yet determined.  */);
10640   eol_mnemonic_undecided = make_pure_c_string (":");
10641
10642   DEFVAR_LISP ("enable-character-translation", Venable_character_translation,
10643                doc: /*
10644 *Non-nil enables character translation while encoding and decoding.  */);
10645   Venable_character_translation = Qt;
10646
10647   DEFVAR_LISP ("standard-translation-table-for-decode",
10648                Vstandard_translation_table_for_decode,
10649                doc: /* Table for translating characters while decoding.  */);
10650   Vstandard_translation_table_for_decode = Qnil;
10651
10652   DEFVAR_LISP ("standard-translation-table-for-encode",
10653                Vstandard_translation_table_for_encode,
10654                doc: /* Table for translating characters while encoding.  */);
10655   Vstandard_translation_table_for_encode = Qnil;
10656
10657   DEFVAR_LISP ("charset-revision-table", Vcharset_revision_table,
10658                doc: /* Alist of charsets vs revision numbers.
10659 While encoding, if a charset (car part of an element) is found,
10660 designate it with the escape sequence identifying revision (cdr part
10661 of the element).  */);
10662   Vcharset_revision_table = Qnil;
10663
10664   DEFVAR_LISP ("default-process-coding-system",
10665                Vdefault_process_coding_system,
10666                doc: /* Cons of coding systems used for process I/O by default.
10667 The car part is used for decoding a process output,
10668 the cdr part is used for encoding a text to be sent to a process.  */);
10669   Vdefault_process_coding_system = Qnil;
10670
10671   DEFVAR_LISP ("latin-extra-code-table", Vlatin_extra_code_table,
10672                doc: /*
10673 Table of extra Latin codes in the range 128..159 (inclusive).
10674 This is a vector of length 256.
10675 If Nth element is non-nil, the existence of code N in a file
10676 \(or output of subprocess) doesn't prevent it to be detected as
10677 a coding system of ISO 2022 variant which has a flag
10678 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10679 or reading output of a subprocess.
10680 Only 128th through 159th elements have a meaning.  */);
10681   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10682
10683   DEFVAR_LISP ("select-safe-coding-system-function",
10684                Vselect_safe_coding_system_function,
10685                doc: /*
10686 Function to call to select safe coding system for encoding a text.
10687
10688 If set, this function is called to force a user to select a proper
10689 coding system which can encode the text in the case that a default
10690 coding system used in each operation can't encode the text.  The
10691 function should take care that the buffer is not modified while
10692 the coding system is being selected.
10693
10694 The default value is `select-safe-coding-system' (which see).  */);
10695   Vselect_safe_coding_system_function = Qnil;
10696
10697   DEFVAR_BOOL ("coding-system-require-warning",
10698                coding_system_require_warning,
10699                doc: /* Internal use only.
10700 If non-nil, on writing a file, `select-safe-coding-system-function' is
10701 called even if `coding-system-for-write' is non-nil.  The command
10702 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10703   coding_system_require_warning = 0;
10704
10705
10706   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10707                inhibit_iso_escape_detection,
10708                doc: /*
10709 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10710
10711 When Emacs reads text, it tries to detect how the text is encoded.
10712 This code detection is sensitive to escape sequences.  If Emacs sees
10713 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10714 of the ISO2022 encodings, and decodes text by the corresponding coding
10715 system (e.g. `iso-2022-7bit').
10716
10717 However, there may be a case that you want to read escape sequences in
10718 a file as is.  In such a case, you can set this variable to non-nil.
10719 Then the code detection will ignore any escape sequences, and no text is
10720 detected as encoded in some ISO-2022 encoding.  The result is that all
10721 escape sequences become visible in a buffer.
10722
10723 The default value is nil, and it is strongly recommended not to change
10724 it.  That is because many Emacs Lisp source files that contain
10725 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10726 in Emacs's distribution, and they won't be decoded correctly on
10727 reading if you suppress escape sequence detection.
10728
10729 The other way to read escape sequences in a file without decoding is
10730 to explicitly specify some coding system that doesn't use ISO-2022
10731 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10732   inhibit_iso_escape_detection = 0;
10733
10734   DEFVAR_BOOL ("inhibit-null-byte-detection",
10735                inhibit_null_byte_detection,
10736                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10737 By default, Emacs treats it as binary data, and does not attempt to
10738 decode it.  The effect is as if you specified `no-conversion' for
10739 reading that text.
10740
10741 Set this to non-nil when a regular text happens to include null bytes.
10742 Examples are Index nodes of Info files and null-byte delimited output
10743 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10744 decode text as usual.  */);
10745   inhibit_null_byte_detection = 0;
10746
10747   DEFVAR_LISP ("translation-table-for-input", Vtranslation_table_for_input,
10748                doc: /* Char table for translating self-inserting characters.
10749 This is applied to the result of input methods, not their input.
10750 See also `keyboard-translate-table'.
10751
10752 Use of this variable for character code unification was rendered
10753 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10754 internal character representation.  */);
10755     Vtranslation_table_for_input = Qnil;
10756
10757   {
10758     Lisp_Object args[coding_arg_max];
10759     Lisp_Object plist[16];
10760     int i;
10761
10762     for (i = 0; i < coding_arg_max; i++)
10763       args[i] = Qnil;
10764
10765     plist[0] = intern_c_string (":name");
10766     plist[1] = args[coding_arg_name] = Qno_conversion;
10767     plist[2] = intern_c_string (":mnemonic");
10768     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10769     plist[4] = intern_c_string (":coding-type");
10770     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10771     plist[6] = intern_c_string (":ascii-compatible-p");
10772     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10773     plist[8] = intern_c_string (":default-char");
10774     plist[9] = args[coding_arg_default_char] = make_number (0);
10775     plist[10] = intern_c_string (":for-unibyte");
10776     plist[11] = args[coding_arg_for_unibyte] = Qt;
10777     plist[12] = intern_c_string (":docstring");
10778     plist[13] = make_pure_c_string ("Do no conversion.\n\
10779 \n\
10780 When you visit a file with this coding, the file is read into a\n\
10781 unibyte buffer as is, thus each byte of a file is treated as a\n\
10782 character.");
10783     plist[14] = intern_c_string (":eol-type");
10784     plist[15] = args[coding_arg_eol_type] = Qunix;
10785     args[coding_arg_plist] = Flist (16, plist);
10786     Fdefine_coding_system_internal (coding_arg_max, args);
10787
10788     plist[1] = args[coding_arg_name] = Qundecided;
10789     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10790     plist[5] = args[coding_arg_coding_type] = Qundecided;
10791     /* This is already set.
10792        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10793     plist[8] = intern_c_string (":charset-list");
10794     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10795     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10796     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10797     plist[15] = args[coding_arg_eol_type] = Qnil;
10798     args[coding_arg_plist] = Flist (16, plist);
10799     Fdefine_coding_system_internal (coding_arg_max, args);
10800   }
10801
10802   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10803
10804   {
10805     int i;
10806
10807     for (i = 0; i < coding_category_max; i++)
10808       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10809   }
10810 #if defined (DOS_NT)
10811   system_eol_type = Qdos;
10812 #else
10813   system_eol_type = Qunix;
10814 #endif
10815   staticpro (&system_eol_type);
10816 }
10817
10818 char *
10819 emacs_strerror (int error_number)
10820 {
10821   char *str;
10822
10823   synchronize_system_messages_locale ();
10824   str = strerror (error_number);
10825
10826   if (! NILP (Vlocale_coding_system))
10827     {
10828       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10829                                                       Vlocale_coding_system,
10830                                                       0);
10831       str = SSDATA (dec);
10832     }
10833
10834   return str;
10835 }
10836
10837 #endif /* emacs */