src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008, 2009 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008, 2009
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software: you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation, either version 3 of the License, or
  17 (at your option) any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  60   C level, a coding system is represented by a vector of attributes
  61   stored in the hash table Vcoding_system_hash_table.  The conversion
  62   from coding system symbol to attributes vector is done by looking
  63   up Vcoding_system_hash_table by the symbol.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we setup a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (coding, detect_info)
 158      struct coding_system *coding;
 159      struct coding_detection_info *detect_info;
 160 {
 161   const unsigned char *src = coding->source;
 162   const unsigned char *src_end = coding->source + coding->src_bytes;
 163   int multibytep = coding->src_multibyte;
 164   int consumed_chars = 0;
 165   int found = 0;
 166   ...;
 167
 168   while (1)
 169     {
 170       /* Get one byte from the source.  If the souce is exausted, jump
 171          to no_more_source:.  */
 172       ONE_MORE_BYTE (c);
 173
 174       if (! __C_conforms_to_XXX___ (c))
 175         break;
 176       if (! __C_strongly_suggests_XXX__ (c))
 177         found = CATEGORY_MASK_XXX;
 178     }
 179   /* The byte sequence is invalid for XXX.  */
 180   detect_info->rejected |= CATEGORY_MASK_XXX;
 181   return 0;
 182
 183  no_more_source:
 184   /* The source exausted successfully.  */
 185   detect_info->found |= found;
 186   return 1;
 187 }
 188 #endif
 189
 190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 191
 192   These functions decode a byte sequence specified as a source by
 193   CODING.  The resulting multibyte text goes to a place pointed to by
 194   CODING->charbuf, the length of which should not exceed
 195   CODING->charbuf_size;
 196
 197   These functions set the information of original and decoded texts in
 198   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 199   They also set CODING->result to one of CODING_RESULT_XXX indicating
 200   how the decoding is finished.
 201
 202   Below is the template of these functions.  */
 203
 204 #if 0
 205 static void
 206 decode_coding_XXXX (coding)
 207      struct coding_system *coding;
 208 {
 209   const unsigned char *src = coding->source + coding->consumed;
 210   const unsigned char *src_end = coding->source + coding->src_bytes;
 211   /* SRC_BASE remembers the start position in source in each loop.
 212      The loop will be exited when there's not enough source code, or
 213      when there's no room in CHARBUF for a decoded character.  */
 214   const unsigned char *src_base;
 215   /* A buffer to produce decoded characters.  */
 216   int *charbuf = coding->charbuf + coding->charbuf_used;
 217   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 218   int multibytep = coding->src_multibyte;
 219
 220   while (1)
 221     {
 222       src_base = src;
 223       if (charbuf < charbuf_end)
 224         /* No more room to produce a decoded character.  */
 225         break;
 226       ONE_MORE_BYTE (c);
 227       /* Decode it. */
 228     }
 229
 230  no_more_source:
 231   if (src_base < src_end
 232       && coding->mode & CODING_MODE_LAST_BLOCK)
 233     /* If the source ends by partial bytes to construct a character,
 234        treat them as eight-bit raw data.  */
 235     while (src_base < src_end && charbuf < charbuf_end)
 236       *charbuf++ = *src_base++;
 237   /* Remember how many bytes and characters we consumed.  If the
 238      source is multibyte, the bytes and chars are not identical.  */
 239   coding->consumed = coding->consumed_char = src_base - coding->source;
 240   /* Remember how many characters we produced.  */
 241   coding->charbuf_used = charbuf - coding->charbuf;
 242 }
 243 #endif
 244
 245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 246
 247   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 248   internal multibyte format by CODING.  The resulting byte sequence
 249   goes to a place pointed to by DESTINATION, the length of which
 250   should not exceed DST_BYTES.
 251
 252   These functions set the information of original and encoded texts in
 253   the members produced, produced_char, consumed, and consumed_char of
 254   the structure *CODING.  They also set the member result to one of
 255   CODING_RESULT_XXX indicating how the encoding finished.
 256
 257   DST_BYTES zero means that source area and destination area are
  258   overlapped, which means that we can produce encoded text until it
  259   reaches the head of the not-yet-encoded source text.
 260
 261   Below is a template of these functions.  */
 262 #if 0
 263 static void
 264 encode_coding_XXX (coding)
 265      struct coding_system *coding;
 266 {
 267   int multibytep = coding->dst_multibyte;
 268   int *charbuf = coding->charbuf;
 269   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 270   unsigned char *dst = coding->destination + coding->produced;
 271   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 272   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 273   int produced_chars = 0;
 274
 275   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 276     {
 277       int c = *charbuf;
 278       /* Encode C into DST, and increment DST.  */
 279     }
 280  label_no_more_destination:
 281   /* How many chars and bytes we produced.  */
 282   coding->produced_char += produced_chars;
 283   coding->produced = dst - coding->destination;
 284 }
 285 #endif
 286
 287 \f
 288 /*** 1. Preamble ***/
 289
 290 #include <config.h>
 291 #include <stdio.h>
 292 #include <setjmp.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302 #include "frame.h"
 303 #include "termhooks.h"
 304
 305 Lisp_Object Vcoding_system_hash_table;
 306
 307 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 308 Lisp_Object Qunix, Qdos;
 309 extern Lisp_Object Qmac;        /* frame.c */
 310 Lisp_Object Qbuffer_file_coding_system;
 311 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 312 Lisp_Object Qdefault_char;
 313 Lisp_Object Qno_conversion, Qundecided;
 314 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 315 Lisp_Object Qbig, Qlittle;
 316 Lisp_Object Qcoding_system_history;
 317 Lisp_Object Qvalid_codes;
 318 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 319 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 320 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 321 Lisp_Object QCascii_compatible_p;
 322
 323 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 324 Lisp_Object Qcall_process, Qcall_process_region;
 325 Lisp_Object Qstart_process, Qopen_network_stream;
 326 Lisp_Object Qtarget_idx;
 327
 328 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 329 Lisp_Object Qinterrupted, Qinsufficient_memory;
 330
 331 extern Lisp_Object Qcompletion_ignore_case;
 332
 333 /* If a symbol has this property, evaluate the value to define the
 334    symbol as a coding system.  */
 335 static Lisp_Object Qcoding_system_define_form;
 336
 337 int coding_system_require_warning;
 338
 339 Lisp_Object Vselect_safe_coding_system_function;
 340
 341 /* Mnemonic string for each format of end-of-line.  */
 342 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 343 /* Mnemonic string to indicate format of end-of-line is not yet
 344    decided.  */
 345 Lisp_Object eol_mnemonic_undecided;
 346
 347 /* Format of end-of-line decided by system.  This is Qunix on
 348    Unix and Mac, Qdos on DOS/Windows.
 349    This has an effect only for external encoding (i.e. for output to
 350    file and process), not for in-buffer or Lisp string encoding.  */
 351 static Lisp_Object system_eol_type;
 352
 353 #ifdef emacs
 354
 355 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 356
 357 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 358
 359 /* Coding system emacs-mule and raw-text are for converting only
 360    end-of-line format.  */
 361 Lisp_Object Qemacs_mule, Qraw_text;
 362 Lisp_Object Qutf_8_emacs;
 363
 364 /* Coding-systems are handed between Emacs Lisp programs and C internal
 365    routines by the following three variables.  */
 366 /* Coding-system for reading files and receiving data from process.  */
 367 Lisp_Object Vcoding_system_for_read;
 368 /* Coding-system for writing files and sending data to process.  */
 369 Lisp_Object Vcoding_system_for_write;
 370 /* Coding-system actually used in the latest I/O.  */
 371 Lisp_Object Vlast_coding_system_used;
 372 /* Set to non-nil when an error is detected while code conversion.  */
 373 Lisp_Object Vlast_code_conversion_error;
 374 /* A vector of length 256 which contains information about special
 375    Latin codes (especially for dealing with Microsoft codes).  */
 376 Lisp_Object Vlatin_extra_code_table;
 377
 378 /* Flag to inhibit code conversion of end-of-line format.  */
 379 int inhibit_eol_conversion;
 380
 381 /* Flag to inhibit ISO2022 escape sequence detection.  */
 382 int inhibit_iso_escape_detection;
 383
 384 /* Flag to inhibit detection of binary files through null bytes.  */
 385 int inhibit_null_byte_detection;
 386
 387 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 388 int inherit_process_coding_system;
 389
 390 /* Coding system to be used to encode text for terminal display when
 391    terminal coding system is nil.  */
 392 struct coding_system safe_terminal_coding;
 393
 394 Lisp_Object Vfile_coding_system_alist;
 395 Lisp_Object Vprocess_coding_system_alist;
 396 Lisp_Object Vnetwork_coding_system_alist;
 397
 398 Lisp_Object Vlocale_coding_system;
 399
 400 #endif /* emacs */
 401
 402 /* Flag to tell if we look up translation table on character code
 403    conversion.  */
 404 Lisp_Object Venable_character_translation;
 405 /* Standard translation table to look up on decoding (reading).  */
 406 Lisp_Object Vstandard_translation_table_for_decode;
 407 /* Standard translation table to look up on encoding (writing).  */
 408 Lisp_Object Vstandard_translation_table_for_encode;
 409
 410 Lisp_Object Qtranslation_table;
 411 Lisp_Object Qtranslation_table_id;
 412 Lisp_Object Qtranslation_table_for_decode;
 413 Lisp_Object Qtranslation_table_for_encode;
 414
 415 /* Alist of charsets vs revision number.  */
 416 static Lisp_Object Vcharset_revision_table;
 417
 418 /* Default coding systems used for process I/O.  */
 419 Lisp_Object Vdefault_process_coding_system;
 420
 421 /* Char table for translating Quail and self-inserting input.  */
 422 Lisp_Object Vtranslation_table_for_input;
 423
 424 /* Two special coding systems.  */
 425 Lisp_Object Vsjis_coding_system;
 426 Lisp_Object Vbig5_coding_system;
 427
 428 /* ISO2022 section */
 429
 430 #define CODING_ISO_INITIAL(coding, reg)                 \
 431   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 432                      coding_attr_iso_initial),          \
 433                reg)))
 434
 435
  436 #define CODING_ISO_REQUEST(coding, charset_id) /* table entry, or -1 */ \
  437   (((charset_id) <= (coding)->max_charset_id   /* within the table? */  \
  438     ? ((coding)->safe_charsets[charset_id] != 255 /* 255 yields -1 */   \
  439        ? (coding)->safe_charsets[charset_id]            \
  440        : -1)                                            \
  441     : -1))
 442
 443
 444 #define CODING_ISO_FLAGS(coding)        \
 445   ((coding)->spec.iso_2022.flags)
 446 #define CODING_ISO_DESIGNATION(coding, reg)     \
 447   ((coding)->spec.iso_2022.current_designation[reg])
 448 #define CODING_ISO_INVOCATION(coding, plane)    \
 449   ((coding)->spec.iso_2022.current_invocation[plane])
 450 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 451   ((coding)->spec.iso_2022.single_shifting)
 452 #define CODING_ISO_BOL(coding)  \
 453   ((coding)->spec.iso_2022.bol)
 454 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 455   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 456 #define CODING_ISO_CMP_STATUS(coding)   \
 457   (&(coding)->spec.iso_2022.cmp_status)
 458 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 459   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 460 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 461   ((coding)->spec.iso_2022.embedded_utf_8)
 462
 463 /* Control characters of ISO2022.  */
 464                         /* code */      /* function */
 465 #define ISO_CODE_LF     0x0A            /* line-feed */
 466 #define ISO_CODE_CR     0x0D            /* carriage-return */
 467 #define ISO_CODE_SO     0x0E            /* shift-out */
 468 #define ISO_CODE_SI     0x0F            /* shift-in */
 469 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 470 #define ISO_CODE_ESC    0x1B            /* escape */
 471 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 472 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 473 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 474
  475 /* Every 1-byte code of ISO2022 is classified into one of the
  476    following classes.  */
  477 enum iso_code_class_type
  478   {
  479     ISO_control_0,              /* Control codes in the range
  480                                    0x00..0x1F and 0x7F, except for the
  481                                    following 4 codes.  */
  482     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  483     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  484     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  485     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  486     ISO_control_1,              /* Control codes in the range
  487                                    0x80..0x9F, except for the
  488                                    following 3 codes.  */
  489     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  490     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  491     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  492     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  493     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  494     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  495     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  496   };
 497
  498 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 499     `iso-flags' attribute of an iso2022 coding system.  */
 500
 501 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 502    instead of the correct short-form sequence (e.g. ESC $ A).  */
 503 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 504
 505 /* If set, reset graphic planes and registers at end-of-line to the
 506    initial state.  */
 507 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 508
 509 /* If set, reset graphic planes and registers before any control
 510    characters to the initial state.  */
 511 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 512
 513 /* If set, encode by 7-bit environment.  */
 514 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 515
 516 /* If set, use locking-shift function.  */
 517 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 518
 519 /* If set, use single-shift function.  Overwrite
 520    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 521 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 522
 523 /* If set, use designation escape sequence.  */
 524 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 525
 526 /* If set, produce revision number sequence.  */
 527 #define CODING_ISO_FLAG_REVISION        0x0080
 528
 529 /* If set, produce ISO6429's direction specifying sequence.  */
 530 #define CODING_ISO_FLAG_DIRECTION       0x0100
 531
 532 /* If set, assume designation states are reset at beginning of line on
 533    output.  */
 534 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 535
 536 /* If set, designation sequence should be placed at beginning of line
 537    on output.  */
 538 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 539
  540 /* If set, do not encode unsafe characters on output.  */
 541 #define CODING_ISO_FLAG_SAFE            0x0800
 542
 543 /* If set, extra latin codes (128..159) are accepted as a valid code
 544    on input.  */
 545 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 546
 547 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 548
 549 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 550
 551 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 552
 553 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 554
 555 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 556
 557 /* A character to be produced on output if encoding of the original
 558    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 559 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 560
 561 /* UTF-8 section */
 562 #define CODING_UTF_8_BOM(coding)        \
 563   ((coding)->spec.utf_8_bom)
 564
 565 /* UTF-16 section */
 566 #define CODING_UTF_16_BOM(coding)       \
 567   ((coding)->spec.utf_16.bom)
 568
 569 #define CODING_UTF_16_ENDIAN(coding)    \
 570   ((coding)->spec.utf_16.endian)
 571
 572 #define CODING_UTF_16_SURROGATE(coding) \
 573   ((coding)->spec.utf_16.surrogate)
 574
 575
 576 /* CCL section */
 577 #define CODING_CCL_DECODER(coding)      \
 578   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 579 #define CODING_CCL_ENCODER(coding)      \
 580   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 581 #define CODING_CCL_VALIDS(coding)                                          \
 582   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 583
  584 /* Index for each coding category in `coding_categories' */
  585 /* Each CATEGORY_MASK_XXX flag below is 1 shifted by one of these.  */
  586 enum coding_category
  587   {
  588     coding_category_iso_7,
  589     coding_category_iso_7_tight,
  590     coding_category_iso_8_1,
  591     coding_category_iso_8_2,
  592     coding_category_iso_7_else,
  593     coding_category_iso_8_else,
  594     coding_category_utf_8_auto,
  595     coding_category_utf_8_nosig,
  596     coding_category_utf_8_sig,
  597     coding_category_utf_16_auto,
  598     coding_category_utf_16_be,
  599     coding_category_utf_16_le,
  600     coding_category_utf_16_be_nosig,
  601     coding_category_utf_16_le_nosig,
  602     coding_category_charset,
  603     coding_category_sjis,
  604     coding_category_big5,
  605     coding_category_ccl,
  606     coding_category_emacs_mule,
  607     /* All above are targets of code detection.  */
  608     coding_category_raw_text,
  609     coding_category_undecided,
  610     coding_category_max         /* Count of the above; sizes the tables.  */
  611   };
 612
 613 /* Definitions of flag bits used in detect_coding_XXXX.  */
 614 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 615 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 616 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 617 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 618 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 619 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 620 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 621 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 622 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 623 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 624 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 625 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 626 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 627 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 628 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 629 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 630 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 631 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 632 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 633 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 634
  635 /* This value is returned if detect_coding_mask () finds nothing other
 636    than ASCII characters.  */
 637 #define CATEGORY_MASK_ANY               \
 638   (CATEGORY_MASK_ISO_7                  \
 639    | CATEGORY_MASK_ISO_7_TIGHT          \
 640    | CATEGORY_MASK_ISO_8_1              \
 641    | CATEGORY_MASK_ISO_8_2              \
 642    | CATEGORY_MASK_ISO_7_ELSE           \
 643    | CATEGORY_MASK_ISO_8_ELSE           \
 644    | CATEGORY_MASK_UTF_8_AUTO           \
 645    | CATEGORY_MASK_UTF_8_NOSIG          \
 646    | CATEGORY_MASK_UTF_8_SIG            \
 647    | CATEGORY_MASK_UTF_16_AUTO          \
 648    | CATEGORY_MASK_UTF_16_BE            \
 649    | CATEGORY_MASK_UTF_16_LE            \
 650    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 651    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 652    | CATEGORY_MASK_CHARSET              \
 653    | CATEGORY_MASK_SJIS                 \
 654    | CATEGORY_MASK_BIG5                 \
 655    | CATEGORY_MASK_CCL                  \
 656    | CATEGORY_MASK_EMACS_MULE)
 657
 658
 659 #define CATEGORY_MASK_ISO_7BIT \
 660   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 661
 662 #define CATEGORY_MASK_ISO_8BIT \
 663   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 664
 665 #define CATEGORY_MASK_ISO_ELSE \
 666   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 667
 668 #define CATEGORY_MASK_ISO_ESCAPE        \
 669   (CATEGORY_MASK_ISO_7                  \
 670    | CATEGORY_MASK_ISO_7_TIGHT          \
 671    | CATEGORY_MASK_ISO_7_ELSE           \
 672    | CATEGORY_MASK_ISO_8_ELSE)
 673
 674 #define CATEGORY_MASK_ISO       \
 675   (  CATEGORY_MASK_ISO_7BIT     \
 676      | CATEGORY_MASK_ISO_8BIT   \
 677      | CATEGORY_MASK_ISO_ELSE)
 678
 679 #define CATEGORY_MASK_UTF_16            \
 680   (CATEGORY_MASK_UTF_16_AUTO            \
 681    | CATEGORY_MASK_UTF_16_BE            \
 682    | CATEGORY_MASK_UTF_16_LE            \
 683    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 684    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 685
 686 #define CATEGORY_MASK_UTF_8     \
 687   (CATEGORY_MASK_UTF_8_AUTO     \
 688    | CATEGORY_MASK_UTF_8_NOSIG  \
 689    | CATEGORY_MASK_UTF_8_SIG)
 690
 691 /* List of symbols `coding-category-xxx' ordered by priority.  This
 692    variable is exposed to Emacs Lisp.  */
 693 static Lisp_Object Vcoding_category_list;
 694
 695 /* Table of coding categories (Lisp symbols).  This variable is for
  696    internal use only.  */
 697 static Lisp_Object Vcoding_category_table;
 698
 699 /* Table of coding-categories ordered by priority.  */
 700 static enum coding_category coding_priorities[coding_category_max];
 701
 702 /* Nth element is a coding context for the coding system bound to the
 703    Nth coding category.  */
 704 static struct coding_system coding_categories[coding_category_max];
 705
 706 /*** Commonly used macros and functions ***/
 707
  708 #ifndef min   /* Keep any definition already in effect.  */
  709 #define min(a, b) ((a) < (b) ? (a) : (b))   /* Evaluates A or B twice.  */
  710 #endif
  711 #ifndef max   /* Keep any definition already in effect.  */
  712 #define max(a, b) ((a) > (b) ? (a) : (b))   /* Evaluates A or B twice.  */
  713 #endif
 714
  715 #define CODING_GET_INFO(coding, attrs, charset_list) /* sets both outputs */ \
  716   do {                                                  \
  717     (attrs) = CODING_ID_ATTRS ((coding)->id);   /* attributes of CODING */ \
  718     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs); /* taken from ATTRS */ \
  719   } while (0)
 720
 721
  722 /* Get one byte from the source text at SRC (ending at SRC_END), set
  723    C to it and increment consumed_chars.  If the source is exhausted,
  724    jump to `no_more_source' (after recording
  725    CODING_RESULT_INSUFFICIENT_SRC when SRC_BASE < SRC).  If multibytep
  726    is nonzero and a multibyte character other than a 0xC0/0xC1 pair
  727    is at SRC, set C to the negative of its character code.  Requires:
  728         src, src_end, src_base, multibytep, coding, consumed_chars */
  729
  730 #define ONE_MORE_BYTE(c)                                \
  731   do {                                                  \
  732     if (src == src_end)                                 \
  733       {                                                 \
  734         if (src_base < src)   /* bytes were read since SRC_BASE */ \
  735           record_conversion_result                      \
  736             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  737         goto no_more_source;                            \
  738       }                                                 \
  739     c = *src++;                                         \
  740     if (multibytep && (c & 0x80))                       \
  741       {                                                 \
  742         if ((c & 0xFE) == 0xC0)   /* lead byte is 0xC0 or 0xC1 */ \
  743           c = ((c & 1) << 6) | *src++; /* NOTE(review): no SRC_END check */ \
  744         else                                            \
  745           {                                             \
  746             src--;                /* back to the lead byte */ \
  747             c = - string_char (src, &src, NULL);        \
  748             record_conversion_result                    \
  749               (coding, CODING_RESULT_INVALID_SRC);      \
  750           }                                             \
  751       }                                                 \
  752     consumed_chars++;                                   \
  753   } while (0)
 754
  755 /* Safely get two bytes from the source text pointed by SRC which ends
  756    at SRC_END, and set C1 and C2 to those bytes while skipping the
  757    heading multibyte characters.  If there are not enough bytes in the
  758    source, it jumps to `no_more_source'.  If multibytep is nonzero and
  759    a multibyte character (other than a 0xC0/0xC1 pair, which is merged
  760    into one byte) is found for C2, set C2 to -1.  The caller should
  761    declare and set these variables appropriately in advance:
  762         src, src_end, multibytep
  763    It is intended that this macro is used in detect_coding_utf_16.  */
  764
  765 #define TWO_MORE_BYTES(c1, c2)                          \
  766   do {                                                  \
  767     do {                                                \
  768       if (src == src_end)                               \
  769         goto no_more_source;                            \
  770       c1 = *src++;                                      \
  771       if (multibytep && (c1 & 0x80))                    \
  772         {                                               \
  773           if ((c1 & 0xFE) == 0xC0)   /* lead byte is 0xC0 or 0xC1 */ \
  774             c1 = ((c1 & 1) << 6) | *src++; /* NOTE(review): no SRC_END check */ \
  775           else                                          \
  776             {                                           \
  777               src += BYTES_BY_CHAR_HEAD (c1) - 1; /* presumably skips the rest */ \
  778               c1 = -1;               /* makes the inner loop retry */ \
  779             }                                           \
  780         }                                               \
  781     } while (c1 < 0);                                   \
  782     if (src == src_end)                                 \
  783       goto no_more_source;                              \
  784     c2 = *src++;                                        \
  785     if (multibytep && (c2 & 0x80))                      \
  786       {                                                 \
  787         if ((c2 & 0xFE) == 0xC0)                        \
  788           c2 = ((c2 & 1) << 6) | *src++;                \
  789         else                                            \
  790           c2 = -1;       /* SRC stays just past the lead byte */ \
  791       }                                                 \
  792   } while (0)
 793
 794
 795 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 796   do {                                                  \
 797     c = *src++;                                         \
 798     if (multibytep && (c & 0x80))                       \
 799       {                                                 \
 800         if ((c & 0xFE) == 0xC0)                         \
 801           c = ((c & 1) << 6) | *src++;                  \
 802         else                                            \
 803           {                                             \
 804             src--;                                      \
 805             c = - string_char (src, &src, NULL);        \
 806             record_conversion_result                    \
 807               (coding, CODING_RESULT_INVALID_SRC);      \
 808           }                                             \
 809       }                                                 \
 810     consumed_chars++;                                   \
 811   } while (0)
 812
 813
 814 /* Store a byte C in the place pointed by DST and increment DST to the
 815    next free point, and increment PRODUCED_CHARS.  The caller should
 816    assure that C is 0..127, and declare and set the variable `dst'
 817    appropriately in advance.
 818 */
 819
 820
 821 #define EMIT_ONE_ASCII_BYTE(c)  \
 822   do {                          \
 823     produced_chars++;           \
 824     *dst++ = (c);               \
 825   } while (0)
 826
 827
 828 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 829
 830 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 831   do {                                  \
 832     produced_chars += 2;                \
 833     *dst++ = (c1), *dst++ = (c2);       \
 834   } while (0)
 835
 836
 837 /* Store a byte C in the place pointed by DST and increment DST to the
 838    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 839    nonzero, store in an appropriate multibyte from.  The caller should
 840    declare and set the variables `dst' and `multibytep' appropriately
 841    in advance.  */
 842
 843 #define EMIT_ONE_BYTE(c)                \
 844   do {                                  \
 845     produced_chars++;                   \
 846     if (multibytep)                     \
 847       {                                 \
 848         int ch = (c);                   \
 849         if (ch >= 0x80)                 \
 850           ch = BYTE8_TO_CHAR (ch);      \
 851         CHAR_STRING_ADVANCE (ch, dst);  \
 852       }                                 \
 853     else                                \
 854       *dst++ = (c);                     \
 855   } while (0)
 856
 857
 858 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 859
 860 #define EMIT_TWO_BYTES(c1, c2)          \
 861   do {                                  \
 862     produced_chars += 2;                \
 863     if (multibytep)                     \
 864       {                                 \
 865         int ch;                         \
 866                                         \
 867         ch = (c1);                      \
 868         if (ch >= 0x80)                 \
 869           ch = BYTE8_TO_CHAR (ch);      \
 870         CHAR_STRING_ADVANCE (ch, dst);  \
 871         ch = (c2);                      \
 872         if (ch >= 0x80)                 \
 873           ch = BYTE8_TO_CHAR (ch);      \
 874         CHAR_STRING_ADVANCE (ch, dst);  \
 875       }                                 \
 876     else                                \
 877       {                                 \
 878         *dst++ = (c1);                  \
 879         *dst++ = (c2);                  \
 880       }                                 \
 881   } while (0)
 882
 883
 884 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 885   do {                                  \
 886     EMIT_ONE_BYTE (c1);                 \
 887     EMIT_TWO_BYTES (c2, c3);            \
 888   } while (0)
 889
 890
 891 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 892   do {                                          \
 893     EMIT_TWO_BYTES (c1, c2);                    \
 894     EMIT_TWO_BYTES (c3, c4);                    \
 895   } while (0)
 896
 897
/* Prototypes for static functions.  They are grouped by coding
   system family in the order of the table of contents at the top of
   this file; each family has a detector, a decoder and an encoder.  */

/* Recording of the outcome of a conversion.  */
static void record_conversion_result P_ ((struct coding_system *coding,
                                          enum coding_result_code result));

/* UTF-8.  */
static int detect_coding_utf_8 P_ ((struct coding_system *,
                                    struct coding_detection_info *info));
static void decode_coding_utf_8 P_ ((struct coding_system *));
static int encode_coding_utf_8 P_ ((struct coding_system *));

/* UTF-16.  */
static int detect_coding_utf_16 P_ ((struct coding_system *,
                                     struct coding_detection_info *info));
static void decode_coding_utf_16 P_ ((struct coding_system *));
static int encode_coding_utf_16 P_ ((struct coding_system *));

/* ISO2022.  */
static int detect_coding_iso_2022 P_ ((struct coding_system *,
                                       struct coding_detection_info *info));
static void decode_coding_iso_2022 P_ ((struct coding_system *));
static int encode_coding_iso_2022 P_ ((struct coding_system *));

/* emacs-mule (old Emacs' internal format).  */
static int detect_coding_emacs_mule P_ ((struct coding_system *,
                                         struct coding_detection_info *info));
static void decode_coding_emacs_mule P_ ((struct coding_system *));
static int encode_coding_emacs_mule P_ ((struct coding_system *));

/* Shift-JIS.  */
static int detect_coding_sjis P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_sjis P_ ((struct coding_system *));
static int encode_coding_sjis P_ ((struct coding_system *));

/* BIG5.  */
static int detect_coding_big5 P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_big5 P_ ((struct coding_system *));
static int encode_coding_big5 P_ ((struct coding_system *));

/* CCL.  */
static int detect_coding_ccl P_ ((struct coding_system *,
                                  struct coding_detection_info *info));
static void decode_coding_ccl P_ ((struct coding_system *));
static int encode_coding_ccl P_ ((struct coding_system *));

/* Raw text (no detector).  */
static void decode_coding_raw_text P_ ((struct coding_system *));
static int encode_coding_raw_text P_ ((struct coding_system *));

/* Source and destination pointer maintenance and destination
   allocation.  */
static void coding_set_source P_ ((struct coding_system *));
static void coding_set_destination P_ ((struct coding_system *));
static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
static void coding_alloc_by_making_gap P_ ((struct coding_system *,
                                            EMACS_INT, EMACS_INT));
static unsigned char *alloc_destination P_ ((struct coding_system *,
                                             EMACS_INT, unsigned char *));

/* Remaining helpers used by the generic conversion code.  */
static void setup_iso_safe_charsets P_ ((Lisp_Object));
static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
                                                     int *, int *,
                                                     unsigned char *));
static int detect_eol P_ ((const unsigned char *,
                           EMACS_INT, enum coding_category));
static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
static void decode_eol P_ ((struct coding_system *));
static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *));
static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
static INLINE void produce_charset P_ ((struct coding_system *, int *,
                                        EMACS_INT));
static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
static int decode_coding P_ ((struct coding_system *));
static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
                                                      struct coding_system *,
                                                      int *, EMACS_INT *));
static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
                                                  struct coding_system *,
                                                  int *, EMACS_INT *));
static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
static int encode_coding P_ ((struct coding_system *));
static Lisp_Object make_conversion_work_buffer P_ ((int));
static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
static INLINE int char_encodable_p P_ ((int, Lisp_Object));
static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 973
 974 static void
 975 record_conversion_result (struct coding_system *coding,
 976                           enum coding_result_code result)
 977 {
 978   coding->result = result;
 979   switch (result)
 980     {
 981     case CODING_RESULT_INSUFFICIENT_SRC:
 982       Vlast_code_conversion_error = Qinsufficient_source;
 983       break;
 984     case CODING_RESULT_INCONSISTENT_EOL:
 985       Vlast_code_conversion_error = Qinconsistent_eol;
 986       break;
 987     case CODING_RESULT_INVALID_SRC:
 988       Vlast_code_conversion_error = Qinvalid_source;
 989       break;
 990     case CODING_RESULT_INTERRUPT:
 991       Vlast_code_conversion_error = Qinterrupted;
 992       break;
 993     case CODING_RESULT_INSUFFICIENT_MEM:
 994       Vlast_code_conversion_error = Qinsufficient_memory;
 995       break;
 996     case CODING_RESULT_SUCCESS:
 997       break;
 998     default:
 999       Vlast_code_conversion_error = intern ("Unknown error");
1000     }
1001 }
1002
/* Set C to the result of DECODE_CHAR (CHARSET, CODE).

   charset_map_loaded is cleared before the call.  If it is nonzero
   afterwards, CODING->source is recomputed with coding_set_source,
   and SRC, SRC_BASE and SRC_END are all shifted by the distance
   CODING->source moved (presumably because loading a charset map may
   relocate the source text -- confirm against DECODE_CHAR).  */

#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {                                                                       \
    charset_map_loaded = 0;                                                  \
    c = DECODE_CHAR (charset, code);                                         \
    if (! charset_map_loaded)                                                \
      break;                                                                 \
    {                                                                        \
      const unsigned char *orig = coding->source;                            \
      EMACS_INT offset;                                                      \
                                                                             \
      coding_set_source (coding);                                            \
      offset = coding->source - orig;                                        \
      src_end += offset;                                                     \
      src_base += offset;                                                    \
      src += offset;                                                         \
    }                                                                        \
  } while (0)
1019
1020
/* Make sure there is room for BYTES more bytes at DST.  If DST +
   BYTES reaches or passes DST_END, enlarge coding->destination
   through alloc_destination by BYTES plus the number of elements
   still left between CHARBUF and CHARBUF_END, then refresh DST and
   DST_END.  We don't have to take care of coding->source which will
   be relocated.  It is handled by calling coding_set_source in
   encode_coding.

   MORE_BYTES is an EMACS_INT because it holds a pointer difference
   and is passed to the EMACS_INT parameter of alloc_destination; an
   `int' could truncate it.

   The caller should declare and set these variables in advance:
        dst, dst_end, charbuf, charbuf_end, coding  */

#define ASSURE_DESTINATION(bytes)                               \
  do {                                                          \
    if (dst + (bytes) >= dst_end)                               \
      {                                                         \
        EMACS_INT more_bytes = charbuf_end - charbuf + (bytes); \
                                                                \
        dst = alloc_destination (coding, more_bytes, dst);      \
        dst_end = coding->destination + coding->dst_bytes;      \
      }                                                         \
  } while (0)
1036
1037
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   Characters up to MAX_5_BYTE_CHAR are written as one to five bytes;
   anything above goes through BYTE8_STRING with C - 0x3FFF80.

   Both C and P are evaluated more than once, so neither may have
   side effects.  C is parenthesized at every use so that an
   expression argument such as `a | b' is shifted and masked as a
   whole.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)        \
  do {                                            \
    if ((c) <= MAX_1_BYTE_CHAR)                   \
      *(p)++ = (c);                               \
    else if ((c) <= MAX_2_BYTE_CHAR)              \
      *(p)++ = (0xC0 | ((c) >> 6)),               \
        *(p)++ = (0x80 | ((c) & 0x3F));           \
    else if ((c) <= MAX_3_BYTE_CHAR)              \
      *(p)++ = (0xE0 | ((c) >> 12)),              \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),    \
        *(p)++ = (0x80 | ((c) & 0x3F));           \
    else if ((c) <= MAX_4_BYTE_CHAR)              \
      *(p)++ = (0xF0 | ((c) >> 18)),              \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),   \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),    \
        *(p)++ = (0x80 | ((c) & 0x3F));           \
    else if ((c) <= MAX_5_BYTE_CHAR)              \
      *(p)++ = 0xF8,                              \
        *(p)++ = (0x80 | (((c) >> 18) & 0x0F)),   \
        *(p)++ = (0x80 | (((c) >> 12) & 0x3F)),   \
        *(p)++ = (0x80 | (((c) >> 6) & 0x3F)),    \
        *(p)++ = (0x80 | ((c) & 0x3F));           \
    else                                          \
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);    \
  } while (0)
1067
1068
/* Return the character code of the character whose multibyte form
   starts at P, and advance P past that form.  This is like
   STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

   The length (1 to 5 bytes) is chosen from the high bits of the
   leading byte.  A two-byte form whose leading byte is below 0xC2
   yields a code with 0x3FFF80 ORed in.  The leading byte of a
   five-byte form contributes no bits to the result.  P is evaluated
   several times.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)                         \
  (((p)[0] & 0x80) == 0                                         \
   ? *(p)++                                                     \
   : ((p)[0] & 0x20) == 0                                       \
   ? ((p) += 2,                                                 \
      (((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)        \
       | (((p)[-2] & 0x1F) << 6)                                \
       | ((p)[-1] & 0x3F)))                                     \
   : ((p)[0] & 0x10) == 0                                       \
   ? ((p) += 3,                                                 \
      (((p)[-1] & 0x3F)                                         \
       | (((p)[-2] & 0x3F) << 6)                                \
       | (((p)[-3] & 0x0F) << 12)))                             \
   : ((p)[0] & 0x08) == 0                                       \
   ? ((p) += 4,                                                 \
      (((p)[-1] & 0x3F)                                         \
       | (((p)[-2] & 0x3F) << 6)                                \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-4] & 0xF) << 18)))                              \
   : ((p) += 5,                                                 \
      (((p)[-1] & 0x3F)                                         \
       | (((p)[-2] & 0x3F) << 6)                                \
       | (((p)[-3] & 0x3F) << 12)                               \
       | (((p)[-4] & 0x3F) << 18))))
1097
1098
1099 static void
1100 coding_set_source (coding)
1101      struct coding_system *coding;
1102 {
1103   if (BUFFERP (coding->src_object))
1104     {
1105       struct buffer *buf = XBUFFER (coding->src_object);
1106
1107       if (coding->src_pos < 0)
1108         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1109       else
1110         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1111     }
1112   else if (STRINGP (coding->src_object))
1113     {
1114       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1115     }
1116   else
1117     /* Otherwise, the source is C string and is never relocated
1118        automatically.  Thus we don't have to update anything.  */
1119     ;
1120 }
1121
1122 static void
1123 coding_set_destination (coding)
1124      struct coding_system *coding;
1125 {
1126   if (BUFFERP (coding->dst_object))
1127     {
1128       if (coding->src_pos < 0)
1129         {
1130           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1131           coding->dst_bytes = (GAP_END_ADDR
1132                                - (coding->src_bytes - coding->consumed)
1133                                - coding->destination);
1134         }
1135       else
1136         {
1137           /* We are sure that coding->dst_pos_byte is before the gap
1138              of the buffer. */
1139           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1140                                  + coding->dst_pos_byte - BEG_BYTE);
1141           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1142                                - coding->destination);
1143         }
1144     }
1145   else
1146     /* Otherwise, the destination is C string and is never relocated
1147        automatically.  Thus we don't have to update anything.  */
1148     ;
1149 }
1150
1151
1152 static void
1153 coding_alloc_by_realloc (coding, bytes)
1154      struct coding_system *coding;
1155      EMACS_INT bytes;
1156 {
1157   coding->destination = (unsigned char *) xrealloc (coding->destination,
1158                                                     coding->dst_bytes + bytes);
1159   coding->dst_bytes += bytes;
1160 }
1161
1162 static void
1163 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1164      struct coding_system *coding;
1165      EMACS_INT gap_head_used, bytes;
1166 {
1167   if (EQ (coding->src_object, coding->dst_object))
1168     {
1169       /* The gap may contain the produced data at the head and not-yet
1170          consumed data at the tail.  To preserve those data, we at
1171          first make the gap size to zero, then increase the gap
1172          size.  */
1173       EMACS_INT add = GAP_SIZE;
1174
1175       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1176       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1177       make_gap (bytes);
1178       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1179       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1180     }
1181   else
1182     {
1183       Lisp_Object this_buffer;
1184
1185       this_buffer = Fcurrent_buffer ();
1186       set_buffer_internal (XBUFFER (coding->dst_object));
1187       make_gap (bytes);
1188       set_buffer_internal (XBUFFER (this_buffer));
1189     }
1190 }
1191
1192
1193 static unsigned char *
1194 alloc_destination (coding, nbytes, dst)
1195      struct coding_system *coding;
1196      EMACS_INT nbytes;
1197      unsigned char *dst;
1198 {
1199   EMACS_INT offset = dst - coding->destination;
1200
1201   if (BUFFERP (coding->dst_object))
1202     {
1203       struct buffer *buf = XBUFFER (coding->dst_object);
1204
1205       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1206     }
1207   else
1208     coding_alloc_by_realloc (coding, nbytes);
1209   coding_set_destination (coding);
1210   dst = coding->destination + offset;
1211   return dst;
1212 }
1213
1214 /** Macros for annotations.  */
1215
1216 /* An annotation data is stored in the array coding->charbuf in this
1217    format:
1218      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1219    LENGTH is the number of elements in the annotation.
1220    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1221    NCHARS is the number of characters in the text annotated.
1222
1223    The format of the following elements depend on ANNOTATION_MASK.
1224
   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1227      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1228
1229    NBYTES is the number of bytes specified in the header part of
1230    old-style emacs-mule encoding, or 0 for the other kind of
1231    composition.
1232
1233    METHOD is one of enum composition_method.
1234
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.
1237
1238    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1239    follows.
1240
1241    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1242    recover from an invalid annotation, and should be skipped by
1243    produce_annotation.  */
1244
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Append the three common header elements of an annotation to BUF,
   namely -LEN, MASK and NCHARS, advancing BUF past them, and set
   coding->annotated.  The caller must have `coding' in scope.

   The expansion deliberately does not end in a semicolon, so that
   the macro behaves as a single statement, e.g. between `if' and
   `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1255
/* Append a composition annotation to BUF: the common header written
   by ADD_ANNOTATION_DATA (length 5, CODING_ANNOTATE_COMPOSITION_MASK,
   NCHARS) followed by NBYTES and METHOD.  BUF is advanced past all
   five elements.  The arguments are parenthesized so that BUF may be
   an expression such as `*pp'.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)                   \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);                                                    \
    *(buf)++ = (method);                                                    \
  } while (0)
1262
1263
/* Append a charset annotation to BUF: the common header written by
   ADD_ANNOTATION_DATA (length 4, CODING_ANNOTATE_CHARSET_MASK,
   NCHARS) followed by the charset ID.  BUF is advanced past all four
   elements.  The arguments are parenthesized so that BUF may be an
   expression such as `*pp'.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1269
1270 \f
1271 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1272
1273
1274
1275 \f
1276 /*** 3. UTF-8 ***/
1277
1278 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1279    Check if a text is encoded in UTF-8.  If it is, return 1, else
1280    return 0.  */
1281
/* Classification of a single octet C by its high bits.  */

/* C is below 0x80, i.e. a complete one-octet character.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* The top two bits of C are 10: a trailing (extra) octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* The top bits of C are 110, 1110, 11110 and 111110 respectively:
   the leading octet of a 2-, 3-, 4- or 5-octet sequence.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* The byte order mark character, and the three octets EF BB BF that
   encode it in UTF-8.  */
#define UTF_BOM 0xFEFF
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1293
1294 static int
1295 detect_coding_utf_8 (coding, detect_info)
1296      struct coding_system *coding;
1297      struct coding_detection_info *detect_info;
1298 {
1299   const unsigned char *src = coding->source, *src_base;
1300   const unsigned char *src_end = coding->source + coding->src_bytes;
1301   int multibytep = coding->src_multibyte;
1302   int consumed_chars = 0;
1303   int bom_found = 0;
1304   int found = 0;
1305
1306   detect_info->checked |= CATEGORY_MASK_UTF_8;
1307   /* A coding system of this category is always ASCII compatible.  */
1308   src += coding->head_ascii;
1309
1310   while (1)
1311     {
1312       int c, c1, c2, c3, c4;
1313
1314       src_base = src;
1315       ONE_MORE_BYTE (c);
1316       if (c < 0 || UTF_8_1_OCTET_P (c))
1317         continue;
1318       ONE_MORE_BYTE (c1);
1319       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1320         break;
1321       if (UTF_8_2_OCTET_LEADING_P (c))
1322         {
1323           found = 1;
1324           continue;
1325         }
1326       ONE_MORE_BYTE (c2);
1327       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1328         break;
1329       if (UTF_8_3_OCTET_LEADING_P (c))
1330         {
1331           found = 1;
1332           if (src_base == coding->source
1333               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1334             bom_found = 1;
1335           continue;
1336         }
1337       ONE_MORE_BYTE (c3);
1338       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1339         break;
1340       if (UTF_8_4_OCTET_LEADING_P (c))
1341         {
1342           found = 1;
1343           continue;
1344         }
1345       ONE_MORE_BYTE (c4);
1346       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1347         break;
1348       if (UTF_8_5_OCTET_LEADING_P (c))
1349         {
1350           found = 1;
1351           continue;
1352         }
1353       break;
1354     }
1355   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1356   return 0;
1357
1358  no_more_source:
1359   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1360     {
1361       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1362       return 0;
1363     }
1364   if (bom_found)
1365     {
1366       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1367       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1368     }
1369   else
1370     {
1371       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1372       if (found)
1373         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1374     }
1375   return 1;
1376 }
1377
1378
1379 static void
1380 decode_coding_utf_8 (coding)
1381      struct coding_system *coding;
1382 {
1383   const unsigned char *src = coding->source + coding->consumed;
1384   const unsigned char *src_end = coding->source + coding->src_bytes;
1385   const unsigned char *src_base;
1386   int *charbuf = coding->charbuf + coding->charbuf_used;
1387   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1388   int consumed_chars = 0, consumed_chars_base = 0;
1389   int multibytep = coding->src_multibyte;
1390   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1391   Lisp_Object attr, charset_list;
1392   int eol_crlf =
1393     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1394   int byte_after_cr = -1;
1395
1396   CODING_GET_INFO (coding, attr, charset_list);
1397
1398   if (bom != utf_without_bom)
1399     {
1400       int c1, c2, c3;
1401
1402       src_base = src;
1403       ONE_MORE_BYTE (c1);
1404       if (! UTF_8_3_OCTET_LEADING_P (c1))
1405         src = src_base;
1406       else
1407         {
1408           ONE_MORE_BYTE (c2);
1409           if (! UTF_8_EXTRA_OCTET_P (c2))
1410             src = src_base;
1411           else
1412             {
1413               ONE_MORE_BYTE (c3);
1414               if (! UTF_8_EXTRA_OCTET_P (c3))
1415                 src = src_base;
1416               else
1417                 {
1418                   if ((c1 != UTF_8_BOM_1)
1419                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1420                     src = src_base;
1421                   else
1422                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1423                 }
1424             }
1425         }
1426     }
1427   CODING_UTF_8_BOM (coding) = utf_without_bom;
1428
1429
1430
1431   while (1)
1432     {
1433       int c, c1, c2, c3, c4, c5;
1434
1435       src_base = src;
1436       consumed_chars_base = consumed_chars;
1437
1438       if (charbuf >= charbuf_end)
1439         {
1440           if (byte_after_cr >= 0)
1441             src_base--;
1442           break;
1443         }
1444
1445       if (byte_after_cr >= 0)
1446         c1 = byte_after_cr, byte_after_cr = -1;
1447       else
1448         ONE_MORE_BYTE (c1);
1449       if (c1 < 0)
1450         {
1451           c = - c1;
1452         }
1453       else if (UTF_8_1_OCTET_P(c1))
1454         {
1455           if (eol_crlf && c1 == '\r')
1456             ONE_MORE_BYTE (byte_after_cr);
1457           c = c1;
1458         }
1459       else
1460         {
1461           ONE_MORE_BYTE (c2);
1462           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1463             goto invalid_code;
1464           if (UTF_8_2_OCTET_LEADING_P (c1))
1465             {
1466               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1467               /* Reject overlong sequences here and below.  Encoders
1468                  producing them are incorrect, they can be misleading,
1469                  and they mess up read/write invariance.  */
1470               if (c < 128)
1471                 goto invalid_code;
1472             }
1473           else
1474             {
1475               ONE_MORE_BYTE (c3);
1476               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1477                 goto invalid_code;
1478               if (UTF_8_3_OCTET_LEADING_P (c1))
1479                 {
1480                   c = (((c1 & 0xF) << 12)
1481                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1482                   if (c < 0x800
1483                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1484                     goto invalid_code;
1485                 }
1486               else
1487                 {
1488                   ONE_MORE_BYTE (c4);
1489                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1490                     goto invalid_code;
1491                   if (UTF_8_4_OCTET_LEADING_P (c1))
1492                     {
1493                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1494                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1495                     if (c < 0x10000)
1496                       goto invalid_code;
1497                     }
1498                   else
1499                     {
1500                       ONE_MORE_BYTE (c5);
1501                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1502                         goto invalid_code;
1503                       if (UTF_8_5_OCTET_LEADING_P (c1))
1504                         {
1505                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1506                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1507                                | (c5 & 0x3F));
1508                           if ((c > MAX_CHAR) || (c < 0x200000))
1509                             goto invalid_code;
1510                         }
1511                       else
1512                         goto invalid_code;
1513                     }
1514                 }
1515             }
1516         }
1517
1518       *charbuf++ = c;
1519       continue;
1520
1521     invalid_code:
1522       src = src_base;
1523       consumed_chars = consumed_chars_base;
1524       ONE_MORE_BYTE (c);
1525       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1526       coding->errors++;
1527     }
1528
1529  no_more_source:
1530   coding->consumed_char += consumed_chars_base;
1531   coding->consumed = src_base - coding->source;
1532   coding->charbuf_used = charbuf - coding->charbuf;
1533 }
1534
1535
1536 static int
1537 encode_coding_utf_8 (coding)
1538      struct coding_system *coding;
1539 {
1540   int multibytep = coding->dst_multibyte;
1541   int *charbuf = coding->charbuf;
1542   int *charbuf_end = charbuf + coding->charbuf_used;
1543   unsigned char *dst = coding->destination + coding->produced;
1544   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1545   int produced_chars = 0;
1546   int c;
1547
1548   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1549     {
1550       ASSURE_DESTINATION (3);
1551       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
1552       CODING_UTF_8_BOM (coding) = utf_without_bom;
1553     }
1554
1555   if (multibytep)
1556     {
1557       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1558
1559       while (charbuf < charbuf_end)
1560         {
1561           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1562
1563           ASSURE_DESTINATION (safe_room);
1564           c = *charbuf++;
1565           if (CHAR_BYTE8_P (c))
1566             {
1567               c = CHAR_TO_BYTE8 (c);
1568               EMIT_ONE_BYTE (c);
1569             }
1570           else
1571             {
1572               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1573               for (p = str; p < pend; p++)
1574                 EMIT_ONE_BYTE (*p);
1575             }
1576         }
1577     }
1578   else
1579     {
1580       int safe_room = MAX_MULTIBYTE_LENGTH;
1581
1582       while (charbuf < charbuf_end)
1583         {
1584           ASSURE_DESTINATION (safe_room);
1585           c = *charbuf++;
1586           if (CHAR_BYTE8_P (c))
1587             *dst++ = CHAR_TO_BYTE8 (c);
1588           else
1589             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1590           produced_chars++;
1591         }
1592     }
1593   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1594   coding->produced_char += produced_chars;
1595   coding->produced = dst - coding->destination;
1596   return 0;
1597 }
1598
1599
1600 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1601    Check if a text is encoded in one of UTF-16 based coding systems.
1602    If it is, return 1, else return 0.  */
1603
/* Nonzero if bits 10..15 of VAL equal those of 0xD800, i.e. the low
   16 bits of VAL lie in the high-surrogate range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  ((((val) ^ 0xD800) & 0xFC00) == 0)

/* Nonzero if bits 10..15 of VAL equal those of 0xDC00, i.e. the low
   16 bits of VAL lie in the low-surrogate range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  ((((val) ^ 0xDC00) & 0xFC00) == 0)

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  */
#define UTF_16_INVALID_P(val)           \
  ((((val) | 1) == 0xFFFF)              \
   || UTF_16_LOW_SURROGATE_P (val))
1614
1615
1616 static int
1617 detect_coding_utf_16 (coding, detect_info)
1618      struct coding_system *coding;
1619      struct coding_detection_info *detect_info;
1620 {
1621   const unsigned char *src = coding->source, *src_base = src;
1622   const unsigned char *src_end = coding->source + coding->src_bytes;
1623   int multibytep = coding->src_multibyte;
1624   int consumed_chars = 0;
1625   int c1, c2;
1626
1627   detect_info->checked |= CATEGORY_MASK_UTF_16;
1628   if (coding->mode & CODING_MODE_LAST_BLOCK
1629       && (coding->src_chars & 1))
1630     {
1631       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1632       return 0;
1633     }
1634
1635   TWO_MORE_BYTES (c1, c2);
1636   if ((c1 == 0xFF) && (c2 == 0xFE))
1637     {
1638       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1639                              | CATEGORY_MASK_UTF_16_AUTO);
1640       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1641                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1642                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1643     }
1644   else if ((c1 == 0xFE) && (c2 == 0xFF))
1645     {
1646       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1647                              | CATEGORY_MASK_UTF_16_AUTO);
1648       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1649                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1650                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1651     }
1652   else if (c2 < 0)
1653     {
1654       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1655       return 0;
1656     }
1657   else
1658     {
1659       /* We check the dispersion of Eth and Oth bytes where E is even and
1660          O is odd.  If both are high, we assume binary data.*/
1661       unsigned char e[256], o[256];
1662       unsigned e_num = 1, o_num = 1;
1663
1664       memset (e, 0, 256);
1665       memset (o, 0, 256);
1666       e[c1] = 1;
1667       o[c2] = 1;
1668
1669       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1670                                 |CATEGORY_MASK_UTF_16_BE
1671                                 | CATEGORY_MASK_UTF_16_LE);
1672
1673       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1674              != CATEGORY_MASK_UTF_16)
1675         {
1676           TWO_MORE_BYTES (c1, c2);
1677           if (c2 < 0)
1678             break;
1679           if (! e[c1])
1680             {
1681               e[c1] = 1;
1682               e_num++;
1683               if (e_num >= 128)
1684                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1685             }
1686           if (! o[c2])
1687             {
1688               o[c2] = 1;
1689               o_num++;
1690               if (o_num >= 128)
1691                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1692             }
1693         }
1694       return 0;
1695     }
1696
1697  no_more_source:
1698   return 1;
1699 }
1700
1701 static void
1702 decode_coding_utf_16 (coding)
1703      struct coding_system *coding;
1704 {
1705   const unsigned char *src = coding->source + coding->consumed;
1706   const unsigned char *src_end = coding->source + coding->src_bytes;
1707   const unsigned char *src_base;
1708   int *charbuf = coding->charbuf + coding->charbuf_used;
1709   /* We may produces at most 3 chars in one loop.  */
1710   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1711   int consumed_chars = 0, consumed_chars_base = 0;
1712   int multibytep = coding->src_multibyte;
1713   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1714   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1715   int surrogate = CODING_UTF_16_SURROGATE (coding);
1716   Lisp_Object attr, charset_list;
1717   int eol_crlf =
1718     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1719   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1720
1721   CODING_GET_INFO (coding, attr, charset_list);
1722
1723   if (bom == utf_with_bom)
1724     {
1725       int c, c1, c2;
1726
1727       src_base = src;
1728       ONE_MORE_BYTE (c1);
1729       ONE_MORE_BYTE (c2);
1730       c = (c1 << 8) | c2;
1731
1732       if (endian == utf_16_big_endian
1733           ? c != 0xFEFF : c != 0xFFFE)
1734         {
1735           /* The first two bytes are not BOM.  Treat them as bytes
1736              for a normal character.  */
1737           src = src_base;
1738           coding->errors++;
1739         }
1740       CODING_UTF_16_BOM (coding) = utf_without_bom;
1741     }
1742   else if (bom == utf_detect_bom)
1743     {
1744       /* We have already tried to detect BOM and failed in
1745          detect_coding.  */
1746       CODING_UTF_16_BOM (coding) = utf_without_bom;
1747     }
1748
1749   while (1)
1750     {
1751       int c, c1, c2;
1752
1753       src_base = src;
1754       consumed_chars_base = consumed_chars;
1755
1756       if (charbuf >= charbuf_end)
1757         {
1758           if (byte_after_cr1 >= 0)
1759             src_base -= 2;
1760           break;
1761         }
1762
1763       if (byte_after_cr1 >= 0)
1764         c1 = byte_after_cr1, byte_after_cr1 = -1;
1765       else
1766         ONE_MORE_BYTE (c1);
1767       if (c1 < 0)
1768         {
1769           *charbuf++ = -c1;
1770           continue;
1771         }
1772       if (byte_after_cr2 >= 0)
1773         c2 = byte_after_cr2, byte_after_cr2 = -1;
1774       else
1775         ONE_MORE_BYTE (c2);
1776       if (c2 < 0)
1777         {
1778           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1779           *charbuf++ = -c2;
1780           continue;
1781         }
1782       c = (endian == utf_16_big_endian
1783            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1784
1785       if (surrogate)
1786         {
1787           if (! UTF_16_LOW_SURROGATE_P (c))
1788             {
1789               if (endian == utf_16_big_endian)
1790                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1791               else
1792                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1793               *charbuf++ = c1;
1794               *charbuf++ = c2;
1795               coding->errors++;
1796               if (UTF_16_HIGH_SURROGATE_P (c))
1797                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1798               else
1799                 *charbuf++ = c;
1800             }
1801           else
1802             {
1803               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1804               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1805               *charbuf++ = 0x10000 + c;
1806             }
1807         }
1808       else
1809         {
1810           if (UTF_16_HIGH_SURROGATE_P (c))
1811             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1812           else
1813             {
1814               if (eol_crlf && c == '\r')
1815                 {
1816                   ONE_MORE_BYTE (byte_after_cr1);
1817                   ONE_MORE_BYTE (byte_after_cr2);
1818                 }
1819               *charbuf++ = c;
1820             }
1821         }
1822     }
1823
1824  no_more_source:
1825   coding->consumed_char += consumed_chars_base;
1826   coding->consumed = src_base - coding->source;
1827   coding->charbuf_used = charbuf - coding->charbuf;
1828 }
1829
1830 static int
1831 encode_coding_utf_16 (coding)
1832      struct coding_system *coding;
1833 {
1834   int multibytep = coding->dst_multibyte;
1835   int *charbuf = coding->charbuf;
1836   int *charbuf_end = charbuf + coding->charbuf_used;
1837   unsigned char *dst = coding->destination + coding->produced;
1838   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1839   int safe_room = 8;
1840   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1841   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1842   int produced_chars = 0;
1843   Lisp_Object attrs, charset_list;
1844   int c;
1845
1846   CODING_GET_INFO (coding, attrs, charset_list);
1847
1848   if (bom != utf_without_bom)
1849     {
1850       ASSURE_DESTINATION (safe_room);
1851       if (big_endian)
1852         EMIT_TWO_BYTES (0xFE, 0xFF);
1853       else
1854         EMIT_TWO_BYTES (0xFF, 0xFE);
1855       CODING_UTF_16_BOM (coding) = utf_without_bom;
1856     }
1857
1858   while (charbuf < charbuf_end)
1859     {
1860       ASSURE_DESTINATION (safe_room);
1861       c = *charbuf++;
1862       if (c > MAX_UNICODE_CHAR)
1863         c = coding->default_char;
1864
1865       if (c < 0x10000)
1866         {
1867           if (big_endian)
1868             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1869           else
1870             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1871         }
1872       else
1873         {
1874           int c1, c2;
1875
1876           c -= 0x10000;
1877           c1 = (c >> 10) + 0xD800;
1878           c2 = (c & 0x3FF) + 0xDC00;
1879           if (big_endian)
1880             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1881           else
1882             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1883         }
1884     }
1885   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1886   coding->produced = dst - coding->destination;
1887   coding->produced_char += produced_chars;
1888   return 0;
1889 }
1890
1891 \f
1892 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1893
1894 /* Emacs' internal format for representation of multiple character
1895    sets is a kind of multi-byte encoding, i.e. characters are
1896    represented by variable-length sequences of one-byte codes.
1897
1898    ASCII characters and control characters (e.g. `tab', `newline') are
1899    represented by one-byte sequences which are their ASCII codes, in
1900    the range 0x00 through 0x7F.
1901
1902    8-bit characters of the range 0x80..0x9F are represented by
1903    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1904    code + 0x20).
1905
1906    8-bit characters of the range 0xA0..0xFF are represented by
1907    one-byte sequences which are their 8-bit code.
1908
1909    The other characters are represented by a sequence of `base
1910    leading-code', optional `extended leading-code', and one or two
1911    `position-code's.  The length of the sequence is determined by the
1912    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1913    whereas extended leading-code and position-code take the range 0xA0
1914    through 0xFF.  See `charset.h' for more details about leading-code
1915    and position-code.
1916
1917    --- CODE RANGE of Emacs' internal format ---
1918    character set        range
1919    -------------        -----
1920    ascii                0x00..0x7F
1921    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1922    eight-bit-graphic    0xA0..0xFF
1923    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1924    ---------------------------------------------
1925
1926    As this is the internal character representation, the format is
1927    usually not used externally (i.e. in a file or in a data sent to a
1928    process).  But, it is possible to have a text externally in this
1929    format (i.e. by encoding by the coding system `emacs-mule').
1930
1931    In that case, a sequence of one-byte codes has a slightly different
1932    form.
1933
1934    At first, all characters in eight-bit-control are represented by
1935    one-byte sequences which are their 8-bit code.
1936
1937    Next, character composition data are represented by the byte
1938    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1939    where,
1940         METHOD is 0xF2 plus one of composition method (enum
1941         composition_method),
1942
1943         BYTES is 0xA0 plus a byte length of this composition data,
1944
1945         CHARS is 0xA0 plus a number of characters composed by this
1946         data,
1947
1948         COMPONENTs are characters of multibyte form or composition
1949         rules encoded by two bytes of ASCII codes.
1950
1951    In addition, for backward compatibility, the following formats are
1952    also recognized as composition data on decoding.
1953
1954    0x80 MSEQ ...
1955    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1956
1957    Here,
1958         MSEQ is a multibyte form but in these special format:
1959           ASCII: 0xA0 ASCII_CODE+0x80,
1960           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1961         RULE is a one byte code of the range 0xA0..0xF0 that
1962         represents a composition rule.
1963   */
1964
/* Table indexed by a byte value.  The emacs-mule routines below read
   an entry as the total number of bytes of the sequence introduced by
   that byte: detect_coding_emacs_mule expects (entry - 1) following
   bytes, and emacs_mule_char dispatches on the values 1 through 4 and
   aborts on anything else.  */
char emacs_mule_bytes[256];
1966
1967
1968 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1969    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1970    else return 0.  */
1971
1972 static int
1973 detect_coding_emacs_mule (coding, detect_info)
1974      struct coding_system *coding;
1975      struct coding_detection_info *detect_info;
1976 {
1977   const unsigned char *src = coding->source, *src_base;
1978   const unsigned char *src_end = coding->source + coding->src_bytes;
1979   int multibytep = coding->src_multibyte;
1980   int consumed_chars = 0;
1981   int c;
1982   int found = 0;
1983
1984   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1985   /* A coding system of this category is always ASCII compatible.  */
1986   src += coding->head_ascii;
1987
1988   while (1)
1989     {
1990       src_base = src;
1991       ONE_MORE_BYTE (c);
1992       if (c < 0)
1993         continue;
1994       if (c == 0x80)
1995         {
1996           /* Perhaps the start of composite character.  We simply skip
1997              it because analyzing it is too heavy for detecting.  But,
1998              at least, we check that the composite character
1999              constitutes of more than 4 bytes.  */
2000           const unsigned char *src_base;
2001
2002         repeat:
2003           src_base = src;
2004           do
2005             {
2006               ONE_MORE_BYTE (c);
2007             }
2008           while (c >= 0xA0);
2009
2010           if (src - src_base <= 4)
2011             break;
2012           found = CATEGORY_MASK_EMACS_MULE;
2013           if (c == 0x80)
2014             goto repeat;
2015         }
2016
2017       if (c < 0x80)
2018         {
2019           if (c < 0x20
2020               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2021             break;
2022         }
2023       else
2024         {
2025           int more_bytes = emacs_mule_bytes[*src_base] - 1;
2026
2027           while (more_bytes > 0)
2028             {
2029               ONE_MORE_BYTE (c);
2030               if (c < 0xA0)
2031                 {
2032                   src--;        /* Unread the last byte.  */
2033                   break;
2034                 }
2035               more_bytes--;
2036             }
2037           if (more_bytes != 0)
2038             break;
2039           found = CATEGORY_MASK_EMACS_MULE;
2040         }
2041     }
2042   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2043   return 0;
2044
2045  no_more_source:
2046   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2047     {
2048       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2049       return 0;
2050     }
2051   detect_info->found |= found;
2052   return 1;
2053 }
2054
2055
2056 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2057    character.  If CMP_STATUS indicates that we must expect MSEQ or
2058    RULE described above, decode it and return the negative value of
2059    the deocded character or rule.  If an invalid byte is found, return
2060    -1.  If SRC is too short, return -2.  */
2061
2062 int
2063 emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status)
2064      struct coding_system *coding;
2065      const unsigned char *src;
2066      int *nbytes, *nchars, *id;
2067      struct composition_status *cmp_status;
2068 {
2069   const unsigned char *src_end = coding->source + coding->src_bytes;
2070   const unsigned char *src_base = src;
2071   int multibytep = coding->src_multibyte;
2072   struct charset *charset;
2073   unsigned code;
2074   int c;
2075   int consumed_chars = 0;
2076   int mseq_found = 0;
2077
2078   ONE_MORE_BYTE (c);
2079   if (c < 0)
2080     {
2081       c = -c;
2082       charset = emacs_mule_charset[0];
2083     }
2084   else
2085     {
2086       if (c >= 0xA0)
2087         {
2088           if (cmp_status->state != COMPOSING_NO
2089               && cmp_status->old_form)
2090             {
2091               if (cmp_status->state == COMPOSING_CHAR)
2092                 {
2093                   if (c == 0xA0)
2094                     {
2095                       ONE_MORE_BYTE (c);
2096                       c -= 0x80;
2097                       if (c < 0)
2098                         goto invalid_code;
2099                     }
2100                   else
2101                     c -= 0x20;
2102                   mseq_found = 1;
2103                 }
2104               else
2105                 {
2106                   *nbytes = src - src_base;
2107                   *nchars = consumed_chars;
2108                   return -c;
2109                 }
2110             }
2111           else
2112             goto invalid_code;
2113         }
2114
2115       switch (emacs_mule_bytes[c])
2116         {
2117         case 2:
2118           if (! (charset = emacs_mule_charset[c]))
2119             goto invalid_code;
2120           ONE_MORE_BYTE (c);
2121           if (c < 0xA0)
2122             goto invalid_code;
2123           code = c & 0x7F;
2124           break;
2125
2126         case 3:
2127           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2128               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2129             {
2130               ONE_MORE_BYTE (c);
2131               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
2132                 goto invalid_code;
2133               ONE_MORE_BYTE (c);
2134               if (c < 0xA0)
2135                 goto invalid_code;
2136               code = c & 0x7F;
2137             }
2138           else
2139             {
2140               if (! (charset = emacs_mule_charset[c]))
2141                 goto invalid_code;
2142               ONE_MORE_BYTE (c);
2143               if (c < 0xA0)
2144                 goto invalid_code;
2145               code = (c & 0x7F) << 8;
2146               ONE_MORE_BYTE (c);
2147               if (c < 0xA0)
2148                 goto invalid_code;
2149               code |= c & 0x7F;
2150             }
2151           break;
2152
2153         case 4:
2154           ONE_MORE_BYTE (c);
2155           if (c < 0 || ! (charset = emacs_mule_charset[c]))
2156             goto invalid_code;
2157           ONE_MORE_BYTE (c);
2158           if (c < 0xA0)
2159             goto invalid_code;
2160           code = (c & 0x7F) << 8;
2161           ONE_MORE_BYTE (c);
2162           if (c < 0xA0)
2163             goto invalid_code;
2164           code |= c & 0x7F;
2165           break;
2166
2167         case 1:
2168           code = c;
2169           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
2170                                      ? charset_ascii : charset_eight_bit);
2171           break;
2172
2173         default:
2174           abort ();
2175         }
2176       c = DECODE_CHAR (charset, code);
2177       if (c < 0)
2178         goto invalid_code;
2179     }
2180   *nbytes = src - src_base;
2181   *nchars = consumed_chars;
2182   if (id)
2183     *id = charset->id;
2184   return (mseq_found ? -c : c);
2185
2186  no_more_source:
2187   return -2;
2188
2189  invalid_code:
2190   return -1;
2191 }
2192
2193
2194 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2195
2196 /* Handle these composition sequence ('|': the end of header elements,
2197    BYTES and CHARS >= 0xA0):
2198
2199    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2200    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2201    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2202
2203    and these old form:
2204
2205    (4) relative composition: 0x80 | MSEQ ... MSEQ
2206    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2207
2208    When the starter 0x80 and the following header elements are found,
2209    this annotation header is produced.
2210
2211         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2212
2213    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2214    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2215
2216    Then, upon reading the following elements, these codes are produced
2217    until the composition end is found:
2218
2219    (1) CHAR ... CHAR
2220    (2) ALT ... ALT CHAR ... CHAR
2221    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2222    (4) CHAR ... CHAR
2223    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2224
2225    When the composition end is found, LENGTH and NCHARS in the
2226    annotation header is updated as below:
2227
2228    (1) LENGTH: unchanged, NCHARS: unchanged
2229    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2230    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2231    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2232    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2233
2234    If an error is found while composing, the annotation header is
2235    changed to the original composition header (plus filler -1s) as
2236    below:
2237
2238    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2239    (5)          [ 0x80 0xFF -1 -1 -1 ]
2240
2241    and the sequence [ -2 DECODED-RULE ] is changed to the original
2242    byte sequence as below:
2243         o the original byte sequence is B: [ B -1 ]
2244         o the original byte sequence is B1 B2: [ B1 B2 ]
2245
2246    Most of the routines are implemented by macros because many
2247    variables and labels in the caller decode_coding_emacs_mule must be
2248    accessible, and they are usually called just once (thus doesn't
2249    increase the size of compiled object).  */
2250
/* Decode the Emacs 20 style composition rule held in C and store it
   in RULE.  C is modified: 0xA0 is subtracted from it, and the result
   must lie in 0..80, otherwise control jumps to `invalid_code'.  The
   value is split into a quotient and a remainder by 9, and a 4 in
   either half is replaced by 10 before the rule is built.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    (c) -= 0xA0;                                        \
    if (! ((c) >= 0 && (c) <= 80))                      \
      goto invalid_code;                                \
    gref = (c) / 9;                                     \
    nref = (c) % 9;                                     \
    if (gref == 4)                                      \
      gref = 10;                                        \
    if (nref == 4)                                      \
      nref = 10;                                        \
    (rule) = COMPOSITION_ENCODE_RULE (gref, nref);      \
  } while (0)
2267
2268
/* Decode the Emacs 21 style composition rule made of C and the byte
   that follows it at SRC, and store it in RULE.  Each of the two
   bytes, less 0x20, must lie in 0..80; otherwise control jumps to
   `invalid_code'.  C is overwritten with the second byte.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
  do {                                                  \
    int gref, nref;                                     \
                                                        \
    gref = (c) - 0x20;                                  \
    if (! (gref >= 0 && gref <= 80))                    \
      goto invalid_code;                                \
    ONE_MORE_BYTE (c);                                  \
    nref = (c) - 0x20;                                  \
    if (! (nref >= 0 && nref <= 80))                    \
      goto invalid_code;                                \
    (rule) = COMPOSITION_ENCODE_RULE (gref, nref);      \
  } while (0)
2286
2287
/* Start of Emacs 21 style format.  On entry C holds the method byte
   (0xF2 plus the composition method).  The next two bytes at SRC are
   BYTES and CHARS, each offset by 0xA0: BYTES is the byte length of
   this composition information and CHARS is the number of characters
   composed.  Jump to `invalid_code' if BYTES is below 3, if BYTES is
   not 4 for a relative composition, or if CHARS is not within
   1..MAX_COMPOSITION_COMPONENTS-1.  Otherwise set up CMP_STATUS and
   add the annotation header to CHARBUF.  */

#define DECODE_EMACS_MULE_21_COMPOSITION()                              \
  do {                                                                  \
    enum composition_method method = c - 0xF2;                          \
    int *charbuf_base = charbuf;                                        \
    int nbytes, nchars;                                                 \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    nbytes = c - 0xA0;                                                  \
    if (nbytes < 3)                                                     \
      goto invalid_code;                                                \
    if (method == COMPOSITION_RELATIVE && nbytes != 4)                  \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    nchars = c - 0xA0;                                                  \
    if (! (nchars > 0 && nchars < MAX_COMPOSITION_COMPONENTS))          \
      goto invalid_code;                                                \
    cmp_status->old_form = 0;                                           \
    cmp_status->method = method;                                        \
    /* A relative composition has no separate component list.  */      \
    cmp_status->state = (method == COMPOSITION_RELATIVE                 \
                         ? COMPOSING_CHAR                               \
                         : COMPOSING_COMPONENT_CHAR);                   \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
    cmp_status->nchars = nchars;                                        \
    cmp_status->ncomps = nbytes - 4;                                    \
    ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
  } while (0)
2320
2321
/* Start of Emacs 20 style format for relative composition.  Reset
   CMP_STATUS for an old-form relative composition with no characters
   or components counted yet, and add an annotation header whose
   NCHARS and NBYTES are both 0.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
  do {                                                          \
    cmp_status->nchars = 0;                                     \
    cmp_status->ncomps = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_RELATIVE;                  \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2333
2334
/* Start of Emacs 20 style format for rule-base composition.  Reset
   CMP_STATUS for an old-form composition with rules, with no
   characters or components counted yet, and add an annotation header
   whose NCHARS and NBYTES are both 0.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
  do {                                                          \
    cmp_status->nchars = 0;                                     \
    cmp_status->ncomps = 0;                                     \
    cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
    cmp_status->state = COMPOSING_CHAR;                         \
    cmp_status->method = COMPOSITION_WITH_RULE;                 \
    cmp_status->old_form = 1;                                   \
    ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
  } while (0)
2346
2347
/* Read the byte that follows a composition starter and begin the
   matching kind of composition:

     o 0xF2 plus a composition method: Emacs 21 style header.
     o 0xA0..0xBF: Emacs 20 style relative composition; the byte is
       itself the first component, so SRC is moved back to re-read it.
     o 0xFF: Emacs 20 style rule-base composition.

   Any other byte jumps to `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_START()                   \
  do {                                                          \
    const unsigned char *current_src = src;                     \
                                                                \
    ONE_MORE_BYTE (c);                                          \
    if (c < 0)                                                  \
      goto invalid_code;                                        \
    if (c - 0xF2 >= COMPOSITION_RELATIVE                        \
        && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)          \
      DECODE_EMACS_MULE_21_COMPOSITION ();                      \
    else if (c >= 0xA0 && c < 0xC0)                             \
      {                                                         \
        DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();           \
        /* Re-read C as a composition component.  */            \
        src = current_src;                                      \
      }                                                         \
    else if (c == 0xFF)                                         \
      DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();             \
    else                                                        \
      goto invalid_code;                                        \
  } while (0)
2371
/* Close the composition currently being decoded and return to the
   COMPOSING_NO state.  The composition's data starts
   CMP_STATUS->length elements before CHARBUF.  For an old-form
   composition, element 2 receives the number of characters read.
   Otherwise, when the method is beyond COMPOSITION_RELATIVE,
   element 0 is set to element 2 minus the length.
   Expands in a context that provides `cmp_status' and an
   `int *charbuf'.  */
#define EMACS_MULE_COMPOSITION_END()                                    \
  do {                                                                  \
    int *cmp_head = charbuf - cmp_status->length;                       \
                                                                        \
    if (cmp_status->old_form)                                           \
      cmp_head[2] = cmp_status->nchars;                                 \
    else if (cmp_status->method > COMPOSITION_RELATIVE)                 \
      cmp_head[0] = cmp_head[2] - cmp_status->length;                   \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)

2382
2383
2384 static int
2385 emacs_mule_finish_composition (charbuf, cmp_status)
2386      int *charbuf;
2387      struct composition_status *cmp_status;
2388 {
2389   int idx = - cmp_status->length;
2390   int new_chars;
2391
2392   if (cmp_status->old_form && cmp_status->nchars > 0)
2393     {
2394       charbuf[idx + 2] = cmp_status->nchars;
2395       new_chars = 0;
2396       if (cmp_status->method == COMPOSITION_WITH_RULE
2397           && cmp_status->state == COMPOSING_CHAR)
2398         {
2399           /* The last rule was invalid.  */
2400           int rule = charbuf[-1] + 0xA0;
2401
2402           charbuf[-2] = BYTE8_TO_CHAR (rule);
2403           charbuf[-1] = -1;
2404           new_chars = 1;
2405         }
2406     }
2407   else
2408     {
2409       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2410
2411       if (cmp_status->method == COMPOSITION_WITH_RULE)
2412         {
2413           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2414           charbuf[idx++] = -3;
2415           charbuf[idx++] = 0;
2416           new_chars = 1;
2417         }
2418       else
2419         {
2420           int nchars = charbuf[idx + 1] + 0xA0;
2421           int nbytes = charbuf[idx + 2] + 0xA0;
2422
2423           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2424           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2425           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2426           charbuf[idx++] = -1;
2427           new_chars = 4;
2428         }
2429     }
2430   cmp_status->state = COMPOSING_NO;
2431   return new_chars;
2432 }
2433
/* If a composition is in progress, finish it with
   emacs_mule_finish_composition and advance `char_offset' by the
   number of characters that call reports.  Expands in a context that
   provides `cmp_status', `charbuf' and `char_offset'.  */
#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                             \
  do {                                                                    \
    if (cmp_status->state != COMPOSING_NO)                                \
      {                                                                   \
        int finished_chars                                                \
          = emacs_mule_finish_composition (charbuf, cmp_status);          \
                                                                          \
        char_offset += finished_chars;                                    \
      }                                                                   \
  } while (0)

2439
2440
/* Decode the emacs-mule encoded bytes of CODING->source, starting at
   offset CODING->consumed, and append the resulting character codes
   and annotations to CODING->charbuf at CODING->charbuf_used.

   The loop ends when CHARBUF reaches CHARBUF_END or when
   emacs_mule_char returns -2.  NOTE(review): no visible `goto'
   targets `no_more_source'; presumably ONE_MORE_BYTE jumps there when
   the source is exhausted -- confirm against that macro.

   A byte sequence that cannot be decoded is emitted one byte at a
   time (see `invalid_code') and counted in CODING->errors.

   If a composition is still in progress when decoding stops and this
   is not the last block (CODING_MODE_LAST_BLOCK unset), its data is
   removed from CHARBUF and saved in CMP_STATUS->carryover; the next
   call copies it back before reading more source.

   On return CODING->consumed_char, CODING->consumed and
   CODING->charbuf_used are updated.  */
2441 static void
2442 decode_coding_emacs_mule (coding)
2443      struct coding_system *coding;
2444 {
2445   const unsigned char *src = coding->source + coding->consumed;
2446   const unsigned char *src_end = coding->source + coding->src_bytes;
2447   const unsigned char *src_base;
2448   int *charbuf = coding->charbuf + coding->charbuf_used;
2449   /* We may produce two annotations (charset and composition) in one
2450      loop and one more charset annotation at the end.  */
2451   int *charbuf_end
2452     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2453   int consumed_chars = 0, consumed_chars_base;
2454   int multibytep = coding->src_multibyte;
2455   Lisp_Object attrs, charset_list;
2456   int char_offset = coding->produced_char;
2457   int last_offset = char_offset;
2458   int last_id = charset_ascii;
2459   int eol_crlf =
2460     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
2461   int byte_after_cr = -1;
2462   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2463
2464   CODING_GET_INFO (coding, attrs, charset_list);
2465
       /* Resume a composition left unfinished by the previous call by
          copying its saved data back into CHARBUF.  */
2466   if (cmp_status->state != COMPOSING_NO)
2467     {
2468       int i;
2469
2470       for (i = 0; i < cmp_status->length; i++)
2471         *charbuf++ = cmp_status->carryover[i];
2472       coding->annotated = 1;
2473     }
2474
2475   while (1)
2476     {
2477       int c, id;
2478
           /* Remember where this iteration started, so that it can be
              undone or reported as the end of the consumed source.  */
2479       src_base = src;
2480       consumed_chars_base = consumed_chars;
2481
2482       if (charbuf >= charbuf_end)
2483         {
               /* Out of room.  If a byte was read ahead after a CR,
                  step SRC_BASE back so that byte is not reported as
                  consumed.  */
2484           if (byte_after_cr >= 0)
2485             src_base--;
2486           break;
2487         }
2488
2489       if (byte_after_cr >= 0)
2490         c = byte_after_cr, byte_after_cr = -1;
2491       else
2492         ONE_MORE_BYTE (c);
2493
           /* A negative C is stored as the character -C (NOTE(review):
              presumably ONE_MORE_BYTE reports a raw byte this way --
              confirm); 0x80 introduces a composition.  Either way, any
              composition in progress is finished first.  */
2494       if (c < 0 || c == 0x80)
2495         {
2496           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2497           if (c < 0)
2498             {
2499               *charbuf++ = -c;
2500               char_offset++;
2501             }
2502           else
2503             DECODE_EMACS_MULE_COMPOSITION_START ();
2504           continue;
2505         }
2506
2507       if (c < 0x80)
2508         {
               /* With DOS end-of-line conversion, read the byte after
                  a CR ahead; it is handled on the next iteration.  */
2509           if (eol_crlf && c == '\r')
2510             ONE_MORE_BYTE (byte_after_cr);
2511           id = charset_ascii;
2512           if (cmp_status->state != COMPOSING_NO)
2513             {
2514               if (cmp_status->old_form)
2515                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2516               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2517                 cmp_status->ncomps--;
2518             }
2519         }
2520       else
2521         {
2522           int nchars, nbytes;
2523
               /* Decode the sequence starting at SRC_BASE.  -1 means
                  an invalid sequence, -2 ends the loop.  */
2524           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2525                                cmp_status);
2526           if (c < 0)
2527             {
2528               if (c == -1)
2529                 goto invalid_code;
2530               if (c == -2)
2531                 break;
2532             }
2533           src = src_base + nbytes;
2534           consumed_chars = consumed_chars_base + nchars;
2535           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2536             cmp_status->ncomps -= nchars;
2537         }
2538
2539       /* Now if C >= 0, we found a normally encoded character, if C <
2540          0, we found an old-style composition component character or
2541          rule.  */
2542
2543       if (cmp_status->state == COMPOSING_NO)
2544         {
               /* On a charset change, record an annotation for the run
                  of the previous non-ASCII charset.  */
2545           if (last_id != id)
2546             {
2547               if (last_id != charset_ascii)
2548                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2549                                   last_id);
2550               last_id = id;
2551               last_offset = char_offset;
2552             }
2553           *charbuf++ = c;
2554           char_offset++;
2555         }
2556       else if (cmp_status->state == COMPOSING_CHAR)
2557         {
2558           if (cmp_status->old_form)
2559             {
2560               if (c >= 0)
2561                 {
                       /* A normally encoded character ends an old-form
                          composition.  */
2562                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2563                   *charbuf++ = c;
2564                   char_offset++;
2565                 }
2566               else
2567                 {
2568                   *charbuf++ = -c;
2569                   cmp_status->nchars++;
2570                   cmp_status->length++;
2571                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2572                     EMACS_MULE_COMPOSITION_END ();
2573                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2574                     cmp_status->state = COMPOSING_RULE;
2575                 }
2576             }
2577           else
2578             {
                   /* New form: NCHARS counts down the characters still
                      expected.  */
2579               *charbuf++ = c;
2580               cmp_status->length++;
2581               cmp_status->nchars--;
2582               if (cmp_status->nchars == 0)
2583                 EMACS_MULE_COMPOSITION_END ();
2584             }
2585         }
2586       else if (cmp_status->state == COMPOSING_RULE)
2587         {
2588           int rule;
2589
2590           if (c >= 0)
2591             {
2592               EMACS_MULE_COMPOSITION_END ();
2593               *charbuf++ = c;
2594               char_offset++;
2595             }
2596           else
2597             {
2598               c = -c;
2599               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2600               if (rule < 0)
2601                 goto invalid_code;
                   /* A rule is stored as the pair (-2, RULE).  */
2602               *charbuf++ = -2;
2603               *charbuf++ = rule;
2604               cmp_status->length += 2;
2605               cmp_status->state = COMPOSING_CHAR;
2606             }
2607         }
2608       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2609         {
2610           *charbuf++ = c;
2611           cmp_status->length++;
2612           if (cmp_status->ncomps == 0)
2613             cmp_status->state = COMPOSING_CHAR;
2614           else if (cmp_status->ncomps > 0)
2615             {
2616               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2617                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2618             }
2619           else
2620             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2621         }
2622       else                      /* COMPOSING_COMPONENT_RULE */
2623         {
2624           int rule;
2625
2626           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2627           if (rule < 0)
2628             goto invalid_code;
2629           *charbuf++ = -2;
2630           *charbuf++ = rule;
2631           cmp_status->length += 2;
2632           cmp_status->ncomps--;
2633           if (cmp_status->ncomps > 0)
2634             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2635           else
2636             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2637         }
2638       continue;
2639
           /* NOTE(review): no `goto retry' is visible in this function;
              the label may be used by a macro or be unused.  */
2640     retry:
2641       src = src_base;
2642       consumed_chars = consumed_chars_base;
2643       continue;
2644
           /* Give up on the sequence that started at SRC_BASE: emit its
              first byte as a character of its own (ASCII as is,
              anything else as a raw byte character), count an error,
              and resume decoding at the following byte.  */
2645     invalid_code:
2646       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2647       src = src_base;
2648       consumed_chars = consumed_chars_base;
2649       ONE_MORE_BYTE (c);
2650       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2651       char_offset++;
2652       coding->errors++;
2653     }
2654
2655  no_more_source:
2656   if (cmp_status->state != COMPOSING_NO)
2657     {
2658       if (coding->mode & CODING_MODE_LAST_BLOCK)
2659         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2660       else
2661         {
               /* More source will follow: take the unfinished
                  composition out of CHARBUF and save it for the next
                  call.  */
2662           int i;
2663
2664           charbuf -= cmp_status->length;
2665           for (i = 0; i < cmp_status->length; i++)
2666             cmp_status->carryover[i] = charbuf[i];
2667         }
2668     }
2669   if (last_id != charset_ascii)
2670     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Report only what was consumed up to the start of the last
          (possibly incomplete) iteration.  */
2671   coding->consumed_char += consumed_chars_base;
2672   coding->consumed = src_base - coding->source;
2673   coding->charbuf_used = charbuf - coding->charbuf;
2674 }
2675
2676
/* Store in CODES[0] and CODES[1] the leading code(s) that the
   emacs-mule format uses for the charset whose emacs-mule id is ID.

   An ID below 0xA0 is its own, single leading code, and CODES[1] is
   set to 0.  A larger ID becomes the second code after a prefix:
   0x9A for ID < 0xE0, 0x9B for ID < 0xF0, 0x9C for ID < 0xF5, and
   0x9D otherwise.

   ID and CODES are evaluated more than once, so they must be free of
   side effects.  The expansion deliberately has no trailing
   semicolon: the caller supplies it, which keeps the macro usable as
   a single statement, e.g. in front of an `else'.  */
#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2690
2691
/* Encode the characters and annotations held in CODING->charbuf into
   the emacs-mule byte format, appending the bytes to
   CODING->destination at CODING->produced.

   The coding system's charset list is forced to
   Vemacs_mule_charset_list.  A negative element of the buffer starts
   an annotation: a charset annotation selects a preferred charset
   (dropped unless it is a member of the charset list), and a
   composition annotation is skipped.  ASCII characters and raw 8-bit
   byte characters are emitted as one byte.  Every other character is
   emitted as one or two leading codes followed by one or two code
   bytes with the high bit set.  A character that no charset in the
   list can encode is replaced by CODING->default_char.

   At the end of the loop the result is recorded as
   CODING_RESULT_SUCCESS, CODING->produced_char and CODING->produced
   are updated, and 0 is returned.  NOTE(review): ASSURE_DESTINATION
   and the EMIT_* macros are defined elsewhere; presumably
   ASSURE_DESTINATION can leave the function early when the
   destination is full -- confirm.  */
2692 static int
2693 encode_coding_emacs_mule (coding)
2694      struct coding_system *coding;
2695 {
2696   int multibytep = coding->dst_multibyte;
2697   int *charbuf = coding->charbuf;
2698   int *charbuf_end = charbuf + coding->charbuf_used;
2699   unsigned char *dst = coding->destination + coding->produced;
2700   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2701   int safe_room = 8;
2702   int produced_chars = 0;
2703   Lisp_Object attrs, charset_list;
2704   int c;
2705   int preferred_charset_id = -1;
2706
2707   CODING_GET_INFO (coding, attrs, charset_list);
2708   if (! EQ (charset_list, Vemacs_mule_charset_list))
2709     {
2710       CODING_ATTR_CHARSET_LIST (attrs)
2711         = charset_list = Vemacs_mule_charset_list;
2712     }
2713
2714   while (charbuf < charbuf_end)
2715     {
2716       ASSURE_DESTINATION (safe_room);
2717       c = *charbuf++;
2718
2719       if (c < 0)
2720         {
2721           /* Handle an annotation.  CHARBUF now points at the element
                  after the (negative) header C; that element holds the
                  annotation type.  */
2722           switch (*charbuf)
2723             {
2724             case CODING_ANNOTATE_COMPOSITION_MASK:
2725               /* Not yet implemented.  */
2726               break;
2727             case CODING_ANNOTATE_CHARSET_MASK:
2728               preferred_charset_id = charbuf[3];
2729               if (preferred_charset_id >= 0
2730                   && NILP (Fmemq (make_number (preferred_charset_id),
2731                                   charset_list)))
2732                 preferred_charset_id = -1;
2733               break;
2734             default:
2735               abort ();
2736             }
               /* Skip the rest of the annotation: -C elements in all,
                  one of which (the header) is already consumed.  */
2737           charbuf += -c - 1;
2738           continue;
2739         }
2740
2741       if (ASCII_CHAR_P (c))
2742         EMIT_ONE_ASCII_BYTE (c);
2743       else if (CHAR_BYTE8_P (c))
2744         {
2745           c = CHAR_TO_BYTE8 (c);
2746           EMIT_ONE_BYTE (c);
2747         }
2748       else
2749         {
2750           struct charset *charset;
2751           unsigned code;
2752           int dimension;
2753           int emacs_mule_id;
2754           unsigned char leading_codes[2];
2755
               /* Try the preferred charset first; fall back to a
                  search of the whole charset list.  */
2756           if (preferred_charset_id >= 0)
2757             {
2758               charset = CHARSET_FROM_ID (preferred_charset_id);
2759               if (CHAR_CHARSET_P (c, charset))
2760                 code = ENCODE_CHAR (charset, c);
2761               else
2762                 charset = char_charset (c, charset_list, &code);
2763             }
2764           else
2765             charset = char_charset (c, charset_list, &code);
2766           if (! charset)
2767             {
                   /* C is not encodable; substitute the default
                      character.  NOTE(review): if char_charset also
                      fails for the default character, CHARSET stays
                      null and is used below -- confirm default_char is
                      always encodable here.  */
2768               c = coding->default_char;
2769               if (ASCII_CHAR_P (c))
2770                 {
2771                   EMIT_ONE_ASCII_BYTE (c);
2772                   continue;
2773                 }
2774               charset = char_charset (c, charset_list, &code);
2775             }
2776           dimension = CHARSET_DIMENSION (charset);
2777           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2778           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2779           EMIT_ONE_BYTE (leading_codes[0]);
2780           if (leading_codes[1])
2781             EMIT_ONE_BYTE (leading_codes[1]);
               /* Code bytes are emitted with their high bit set.  */
2782           if (dimension == 1)
2783             EMIT_ONE_BYTE (code | 0x80);
2784           else
2785             {
2786               code |= 0x8080;
2787               EMIT_ONE_BYTE (code >> 8);
2788               EMIT_ONE_BYTE (code & 0xFF);
2789             }
2790         }
2791     }
2792   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2793   coding->produced_char += produced_chars;
2794   coding->produced = dst - coding->destination;
2795   return 0;
2796 }
2797
2798 \f
2799 /*** 7. ISO2022 handlers ***/
2800
2801 /* The following note describes the coding system ISO2022 briefly.
2802    Since the intention of this note is to help understand the
2803    functions in this file, some parts are NOT ACCURATE or are OVERLY
2804    SIMPLIFIED.  For thorough understanding, please refer to the
2805    original document of ISO2022.  This is equivalent to the standard
2806    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2807
2808    ISO2022 provides many mechanisms to encode several character sets
2809    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2810    is encoded using bytes less than 128.  This may make the encoded
2811    text a little bit longer, but the text passes more easily through
2812    several types of gateway, some of which strip off the MSB (Most
2813    Significant Bit).
2814
2815    There are two kinds of character sets: control character sets and
2816    graphic character sets.  The former contain control characters such
2817    as `newline' and `escape' to provide control functions (control
2818    functions are also provided by escape sequences).  The latter
2819    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2820    two control character sets and many graphic character sets.
2821
2822    Graphic character sets are classified into one of the following
2823    four classes, according to the number of bytes (DIMENSION) and
2824    number of characters in one dimension (CHARS) of the set:
2825    - DIMENSION1_CHARS94
2826    - DIMENSION1_CHARS96
2827    - DIMENSION2_CHARS94
2828    - DIMENSION2_CHARS96
2829
2830    In addition, each character set is assigned an identification tag,
2831    unique for each set, called the "final character" (denoted as <F>
2832    hereafter).  The <F> of each character set is decided by ECMA(*)
2833    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2834    (0x30..0x3F are for private use only).
2835
2836    Note (*): ECMA = European Computer Manufacturers Association
2837
2838    Here are examples of graphic character sets [NAME(<F>)]:
2839         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2840         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2841         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2842         o DIMENSION2_CHARS96 -- none for the moment
2843
2844    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2845         C0 [0x00..0x1F] -- control character plane 0
2846         GL [0x20..0x7F] -- graphic character plane 0
2847         C1 [0x80..0x9F] -- control character plane 1
2848         GR [0xA0..0xFF] -- graphic character plane 1
2849
2850    A control character set is directly designated and invoked to C0 or
2851    C1 by an escape sequence.  The most common case is that:
2852    - ISO646's  control character set is designated/invoked to C0, and
2853    - ISO6429's control character set is designated/invoked to C1,
2854    and usually these designations/invocations are omitted in encoded
2855    text.  In a 7-bit environment, only C0 can be used, and a control
2856    character for C1 is encoded by an appropriate escape sequence to
2857    fit into the environment.  All control characters for C1 are
2858    defined to have corresponding escape sequences.
2859
2860    A graphic character set is at first designated to one of four
2861    graphic registers (G0 through G3), then these graphic registers are
2862    invoked to GL or GR.  These designations and invocations can be
2863    done independently.  The most common case is that G0 is invoked to
2864    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2865    these invocations and designations are omitted in encoded text.
2866    In a 7-bit environment, only GL can be used.
2867
2868    When a graphic character set of CHARS94 is invoked to GL, codes
2869    0x20 and 0x7F of the GL area work as control characters SPACE and
2870    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2871    be used.
2872
2873    There are two ways of invocation: locking-shift and single-shift.
2874    With locking-shift, the invocation lasts until the next different
2875    invocation, whereas with single-shift, the invocation affects the
2876    following character only and doesn't affect the locking-shift
2877    state.  Invocations are done by the following control characters or
2878    escape sequences:
2879
2880    ----------------------------------------------------------------------
2881    abbrev  function                  cntrl escape seq   description
2882    ----------------------------------------------------------------------
2883    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2884    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2885    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2886    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2887    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2888    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2889    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2890    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2891    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2892    ----------------------------------------------------------------------
2893    (*) These are not used by any known coding system.
2894
2895    Control characters for these functions are defined by macros
2896    ISO_CODE_XXX in `coding.h'.
2897
2898    Designations are done by the following escape sequences:
2899    ----------------------------------------------------------------------
2900    escape sequence      description
2901    ----------------------------------------------------------------------
2902    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2903    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2904    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2905    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2906    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2907    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2908    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2909    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2910    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2911    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2912    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2913    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2914    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2915    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2916    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2917    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2918    ----------------------------------------------------------------------
2919
2920    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2921    of dimension 1, chars 94, and final character <F>, etc...
2922
2923    Note (*): Although these designations are not allowed in ISO2022,
2924    Emacs accepts them on decoding, and produces them on encoding
2925    CHARS96 character sets in a coding system which is characterized as
2926    7-bit environment, non-locking-shift, and non-single-shift.
2927
2928    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2929    '(' must be omitted.  We refer to this as "short-form" hereafter.
2930
2931    Now you may notice that there are a lot of ways of encoding the
2932    same multilingual text in ISO2022.  Actually, there exist many
2933    coding systems such as Compound Text (used in X11's inter client
2934    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2935    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2936    localized platforms), and all of these are variants of ISO2022.
2937
2938    In addition to the above, Emacs handles two more kinds of escape
2939    sequences: ISO6429's direction specification and Emacs' private
2940    sequence for specifying character composition.
2941
2942    ISO6429's direction specification takes the following form:
2943         o CSI ']'      -- end of the current direction
2944         o CSI '0' ']'  -- end of the current direction
2945         o CSI '1' ']'  -- start of left-to-right text
2946         o CSI '2' ']'  -- start of right-to-left text
2947    The control character CSI (0x9B: control sequence introducer) is
2948    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2949
2950    Character composition specification takes the following form:
2951         o ESC '0' -- start relative composition
2952         o ESC '1' -- end composition
2953         o ESC '2' -- start rule-base composition (*)
2954         o ESC '3' -- start relative composition with alternate chars  (**)
2955         o ESC '4' -- start rule-base composition with alternate chars  (**)
2956   Since these are not standard escape sequences of any ISO standard,
2957   the use of them with these meanings is restricted to Emacs only.
2958
2959   (*) This form is used only in Emacs 20.7 and older versions,
2960   but newer versions can safely decode it.
2961   (**) This form is used only in Emacs 21.1 and newer versions,
2962   and older versions can't decode it.
2963
2964   Here's a list of example usages of these composition escape
2965   sequences (categorized by `enum composition_method').
2966
2967   COMPOSITION_RELATIVE:
2968         ESC 0 CHAR [ CHAR ] ESC 1
2969   COMPOSITION_WITH_RULE:
2970         ESC 2 CHAR [ RULE CHAR ] ESC 1
2971   COMPOSITION_WITH_ALTCHARS:
2972         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2973   COMPOSITION_WITH_RULE_ALTCHARS:
2974         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2975
/* Table of 256 `enum iso_code_class_type' values.  NOTE(review):
   presumably indexed by byte value to give the ISO 2022 code class of
   that byte; it is filled in elsewhere -- confirm.  */
2976 enum iso_code_class_type iso_code_class[256];
2977
/* Return nonzero if the charset with id ID is safe for CODING, i.e.
   ID is a valid index into CODING->safe_charsets (0 ..
   CODING->max_charset_id) and its entry is not 255, the value
   setup_iso_safe_charsets leaves for charsets that have no graphic
   register.

   The lower bound matters: detect_coding_iso_2022 can pass an ID read
   from iso_charset_table without checking it for a negative value,
   and a negative ID must not index the table.  ID is evaluated more
   than once, so it must be free of side effects.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[id] != 255)
2981
2982
/* Nonzero unless CODING_ISO_INITIAL for register 1 of the coding
   system registered under CATEGORY in coding_categories is negative.
   NOTE(review): presumably that value is the charset initially
   designated to graphic register G1, so this tells whether shift-out
   has a charset to invoke -- confirm.  */
#define SHIFT_OUT_OK(category)                                          \
  (! (CODING_ISO_INITIAL (&coding_categories[category], 1) < 0))
2985
2986 static void
2987 setup_iso_safe_charsets (attrs)
2988      Lisp_Object attrs;
2989 {
2990   Lisp_Object charset_list, safe_charsets;
2991   Lisp_Object request;
2992   Lisp_Object reg_usage;
2993   Lisp_Object tail;
2994   int reg94, reg96;
2995   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2996   int max_charset_id;
2997
2998   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2999   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
3000       && ! EQ (charset_list, Viso_2022_charset_list))
3001     {
3002       CODING_ATTR_CHARSET_LIST (attrs)
3003         = charset_list = Viso_2022_charset_list;
3004       ASET (attrs, coding_attr_safe_charsets, Qnil);
3005     }
3006
3007   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
3008     return;
3009
3010   max_charset_id = 0;
3011   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3012     {
3013       int id = XINT (XCAR (tail));
3014       if (max_charset_id < id)
3015         max_charset_id = id;
3016     }
3017
3018   safe_charsets = make_uninit_string (max_charset_id + 1);
3019   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
3020   request = AREF (attrs, coding_attr_iso_request);
3021   reg_usage = AREF (attrs, coding_attr_iso_usage);
3022   reg94 = XINT (XCAR (reg_usage));
3023   reg96 = XINT (XCDR (reg_usage));
3024
3025   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3026     {
3027       Lisp_Object id;
3028       Lisp_Object reg;
3029       struct charset *charset;
3030
3031       id = XCAR (tail);
3032       charset = CHARSET_FROM_ID (XINT (id));
3033       reg = Fcdr (Fassq (id, request));
3034       if (! NILP (reg))
3035         SSET (safe_charsets, XINT (id), XINT (reg));
3036       else if (charset->iso_chars_96)
3037         {
3038           if (reg96 < 4)
3039             SSET (safe_charsets, XINT (id), reg96);
3040         }
3041       else
3042         {
3043           if (reg94 < 4)
3044             SSET (safe_charsets, XINT (id), reg94);
3045         }
3046     }
3047   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3048 }
3049
3050
3051 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3052    Check if a text is encoded in one of ISO-2022 based coding systems.
3053    If it is, return 1, else return 0.  */
3054
3055 static int
3056 detect_coding_iso_2022 (coding, detect_info)
3057      struct coding_system *coding;
3058      struct coding_detection_info *detect_info;
3059 {
3060   const unsigned char *src = coding->source, *src_base = src;
3061   const unsigned char *src_end = coding->source + coding->src_bytes;
3062   int multibytep = coding->src_multibyte;
3063   int single_shifting = 0;
3064   int id;
3065   int c, c1;
3066   int consumed_chars = 0;
3067   int i;
3068   int rejected = 0;
3069   int found = 0;
3070   int composition_count = -1;
3071
3072   detect_info->checked |= CATEGORY_MASK_ISO;
3073
3074   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3075     {
3076       struct coding_system *this = &(coding_categories[i]);
3077       Lisp_Object attrs, val;
3078
3079       if (this->id < 0)
3080         continue;
3081       attrs = CODING_ID_ATTRS (this->id);
3082       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3083           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3084         setup_iso_safe_charsets (attrs);
3085       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3086       this->max_charset_id = SCHARS (val) - 1;
3087       this->safe_charsets = SDATA (val);
3088     }
3089
3090   /* A coding system of this category is always ASCII compatible.  */
3091   src += coding->head_ascii;
3092
3093   while (rejected != CATEGORY_MASK_ISO)
3094     {
3095       src_base = src;
3096       ONE_MORE_BYTE (c);
3097       switch (c)
3098         {
3099         case ISO_CODE_ESC:
3100           if (inhibit_iso_escape_detection)
3101             break;
3102           single_shifting = 0;
3103           ONE_MORE_BYTE (c);
3104           if (c >= '(' && c <= '/')
3105             {
3106               /* Designation sequence for a charset of dimension 1.  */
3107               ONE_MORE_BYTE (c1);
3108               if (c1 < ' ' || c1 >= 0x80
3109                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3110                 /* Invalid designation sequence.  Just ignore.  */
3111                 break;
3112             }
3113           else if (c == '$')
3114             {
3115               /* Designation sequence for a charset of dimension 2.  */
3116               ONE_MORE_BYTE (c);
3117               if (c >= '@' && c <= 'B')
3118                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3119                 id = iso_charset_table[1][0][c];
3120               else if (c >= '(' && c <= '/')
3121                 {
3122                   ONE_MORE_BYTE (c1);
3123                   if (c1 < ' ' || c1 >= 0x80
3124                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3125                     /* Invalid designation sequence.  Just ignore.  */
3126                     break;
3127                 }
3128               else
3129                 /* Invalid designation sequence.  Just ignore it.  */
3130                 break;
3131             }
3132           else if (c == 'N' || c == 'O')
3133             {
3134               /* ESC <Fe> for SS2 or SS3.  */
3135               single_shifting = 1;
3136               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3137               break;
3138             }
3139           else if (c == '1')
3140             {
3141               /* End of composition.  */
3142               if (composition_count < 0
3143                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3144                 /* Invalid */
3145                 break;
3146               composition_count = -1;
3147               found |= CATEGORY_MASK_ISO;
3148             }
3149           else if (c >= '0' && c <= '4')
3150             {
3151               /* ESC <Fp> for start/end composition.  */
3152               composition_count = 0;
3153               break;
3154             }
3155           else
3156             {
3157               /* Invalid escape sequence.  Just ignore it.  */
3158               break;
3159             }
3160
3161           /* We found a valid designation sequence for CHARSET.  */
3162           rejected |= CATEGORY_MASK_ISO_8BIT;
3163           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3164                               id))
3165             found |= CATEGORY_MASK_ISO_7;
3166           else
3167             rejected |= CATEGORY_MASK_ISO_7;
3168           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3169                               id))
3170             found |= CATEGORY_MASK_ISO_7_TIGHT;
3171           else
3172             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3173           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3174                               id))
3175             found |= CATEGORY_MASK_ISO_7_ELSE;
3176           else
3177             rejected |= CATEGORY_MASK_ISO_7_ELSE;
3178           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3179                               id))
3180             found |= CATEGORY_MASK_ISO_8_ELSE;
3181           else
3182             rejected |= CATEGORY_MASK_ISO_8_ELSE;
3183           break;
3184
3185         case ISO_CODE_SO:
3186         case ISO_CODE_SI:
3187           /* Locking shift out/in.  */
3188           if (inhibit_iso_escape_detection)
3189             break;
3190           single_shifting = 0;
3191           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3192           break;
3193
3194         case ISO_CODE_CSI:
3195           /* Control sequence introducer.  */
3196           single_shifting = 0;
3197           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3198           found |= CATEGORY_MASK_ISO_8_ELSE;
3199           goto check_extra_latin;
3200
3201         case ISO_CODE_SS2:
3202         case ISO_CODE_SS3:
3203           /* Single shift.   */
3204           if (inhibit_iso_escape_detection)
3205             break;
3206           single_shifting = 0;
3207           rejected |= CATEGORY_MASK_ISO_7BIT;
3208           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3209               & CODING_ISO_FLAG_SINGLE_SHIFT)
3210             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3211           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3212               & CODING_ISO_FLAG_SINGLE_SHIFT)
3213             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3214           if (single_shifting)
3215             break;
3216           goto check_extra_latin;
3217
3218         default:
3219           if (c < 0)
3220             continue;
3221           if (c < 0x80)
3222             {
3223               if (composition_count >= 0)
3224                 composition_count++;
3225               single_shifting = 0;
3226               break;
3227             }
3228           if (c >= 0xA0)
3229             {
3230               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3231               found |= CATEGORY_MASK_ISO_8_1;
3232               /* Check the length of succeeding codes of the range
3233                  0xA0..0FF.  If the byte length is even, we include
3234                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3235                  only when we are not single shifting.  */
3236               if (! single_shifting
3237                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3238                 {
3239                   int i = 1;
3240                   while (src < src_end)
3241                     {
3242                       ONE_MORE_BYTE (c);
3243                       if (c < 0xA0)
3244                         break;
3245                       i++;
3246                     }
3247
3248                   if (i & 1 && src < src_end)
3249                     {
3250                       rejected |= CATEGORY_MASK_ISO_8_2;
3251                       if (composition_count >= 0)
3252                         composition_count += i;
3253                     }
3254                   else
3255                     {
3256                       found |= CATEGORY_MASK_ISO_8_2;
3257                       if (composition_count >= 0)
3258                         composition_count += i / 2;
3259                     }
3260                 }
3261               break;
3262             }
3263         check_extra_latin:
3264           single_shifting = 0;
3265           if (! VECTORP (Vlatin_extra_code_table)
3266               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3267             {
3268               rejected = CATEGORY_MASK_ISO;
3269               break;
3270             }
3271           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3272               & CODING_ISO_FLAG_LATIN_EXTRA)
3273             found |= CATEGORY_MASK_ISO_8_1;
3274           else
3275             rejected |= CATEGORY_MASK_ISO_8_1;
3276           rejected |= CATEGORY_MASK_ISO_8_2;
3277         }
3278     }
3279   detect_info->rejected |= CATEGORY_MASK_ISO;
3280   return 0;
3281
3282  no_more_source:
3283   detect_info->rejected |= rejected;
3284   detect_info->found |= (found & ~rejected);
3285   return 1;
3286 }
3287
3288
/* Designate to graphic register REG of CODING the charset that
   ISO_CHARSET_TABLE gives for dimension DIM, the CHARS_96 flag and
   the final byte FINAL.  `coding' is a variable of the expansion site.

   If FINAL is outside '0'..127, the table has no charset for it, or
   the charset is not safe for CODING, REG is marked as invalidly
   designated (-2).  CHARS_96 is overwritten with -1 when the escape
   sequence should be kept: on such a rejection, and also when ASCII
   is designated to a register whose previous designation was
   invalid.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int desig_id = -1;                                                  \
    int desig_ok = 0;                                                   \
                                                                        \
    if (! (final < '0' || final >= 128))                                \
      {                                                                 \
        desig_id = ISO_CHARSET_TABLE (dim, chars_96, final);            \
        if (desig_id >= 0 && SAFE_CHARSET_P (coding, desig_id))         \
          desig_ok = 1;                                                 \
      }                                                                 \
    if (! desig_ok)                                                     \
      {                                                                 \
        /* Reject: remember that REG holds an invalid designation.  */  \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        chars_96 = -1;                                                  \
      }                                                                 \
    else                                                                \
      {                                                                 \
        int desig_prev = CODING_ISO_DESIGNATION (coding, reg);          \
                                                                        \
        if (desig_id == charset_jisx0201_roman)                         \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)  \
              desig_id = charset_ascii;                                 \
          }                                                             \
        else if (desig_id == charset_jisx0208_1978)                     \
          {                                                             \
            if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
              desig_id = charset_jisx0208;                              \
          }                                                             \
        CODING_ISO_DESIGNATION (coding, reg) = desig_id;                \
        /* An ASCII designation that replaces an earlier invalid        \
           designation to REG must be kept in the output as well.  */   \
        if (desig_prev == -2 && desig_id == charset_ascii)              \
          chars_96 = -1;                                                \
      }                                                                 \
  } while (0)
3321
3322
/* Handle these composition sequences (ALT: alternate char):
3324
3325    (1) relative composition: ESC 0 CHAR ... ESC 1
3326    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3327    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3328    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3329
3330    When the start sequence (ESC 0/2/3/4) is found, this annotation
3331    header is produced.
3332
3333         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3334
3335    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3336    produced until the end sequence (ESC 1) is found:
3337
3338    (1) CHAR ... CHAR
3339    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3340    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3341    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3342
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
3345
3346    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3347    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3348    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3349    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3350
3351    If an error is found while composing, the annotation header is
3352    changed to:
3353
        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3355
3356    and the sequence [ -2 DECODED-RULE ] is changed to the original
3357    byte sequence as below:
3358         o the original byte sequence is B: [ B -1 ]
3359         o the original byte sequence is B1 B2: [ B1 B2 ]
3360    and the sequence [ -1 -1 ] is changed to the original byte
3361    sequence:
3362         [ ESC '0' ]
3363 */
3364
/* Decode a composition rule that starts with the byte C1 (a variable
   of the expansion site, not a macro argument).  Store the encoded
   composition rule in RULE and the number of source bytes the rule
   occupies in NBYTES.

   A first byte below 32 + 81 is the one-byte old format; a larger one
   needs a second byte, which is fetched with ONE_MORE_BYTE.  If the
   rule is invalid, RULE ends up negative; when C1 is below 32, NBYTES
   is not set at all.  */

#define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
  do {                                                                  \
    (rule) = c1 - 32;                                                   \
    if ((rule) < 0)                                                     \
      break;                                                            \
    if ((rule) >= 81)           /* new format (after ver.21) */         \
      {                                                                 \
        int second_byte;                                                \
                                                                        \
        ONE_MORE_BYTE (second_byte);                                    \
        (rule) = COMPOSITION_ENCODE_RULE ((rule) - 81,                  \
                                          second_byte - 32);            \
        if ((rule) >= 0)                                                \
          (rule) += 0x100; /* to distinguish it from the old format */  \
        (nbytes) = 2;                                                   \
      }                                                                 \
    else                        /* old format (before ver.21) */        \
      {                                                                 \
        int gref = (rule) / 9;                                          \
        int nref = (rule) - gref * 9;                                   \
                                                                        \
        /* The old format stores reference point 10 as 4.  */           \
        gref = (gref == 4 ? 10 : gref);                                 \
        nref = (nref == 4 ? 10 : nref);                                 \
        (rule) = COMPOSITION_ENCODE_RULE (gref, nref);                  \
        (nbytes) = 1;                                                   \
      }                                                                 \
  } while (0)
3395
/* Convert the decoded composition rule RULE back to its original byte
   sequence and store it in the two slots charbuf[idx] and
   charbuf[idx + 1], adding the number of bytes restored to new_chars.
   `charbuf', `idx' and `new_chars' are variables of the expansion
   site.

   A rule below 0x100 came from the one-byte old format: the byte is
   stored, followed by -1.  Otherwise the two bytes of the new format
   are stored.

   RULE is evaluated exactly once, before either slot is written, so
   it may be any expression (including charbuf[idx + 1] itself or an
   arithmetic expression such as BASE + N).  */

#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int enc_rule = (rule);                                      \
    int gref = (enc_rule % 0x100) / 12;                         \
    int nref = (enc_rule % 0x100) % 12;                         \
                                                                \
    if (enc_rule < 0x100)       /* old format */                \
      {                                                         \
        /* The old format stores reference point 10 as 4.  */   \
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                        /* new format */                \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
3415
3416 /* Finish the current composition as invalid.  */
3417
3418 static int finish_composition P_ ((int *, struct composition_status *));
3419
3420 static int
3421 finish_composition (charbuf, cmp_status)
3422      int *charbuf;
3423      struct composition_status *cmp_status;
3424 {
3425   int idx = - cmp_status->length;
3426   int new_chars;
3427
3428   /* Recover the original ESC sequence */
3429   charbuf[idx++] = ISO_CODE_ESC;
3430   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3431                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3432                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3433                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3434                     : '4');
3435   charbuf[idx++] = -2;
3436   charbuf[idx++] = 0;
3437   charbuf[idx++] = -1;
3438   new_chars = cmp_status->nchars;
3439   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3440     for (; idx < 0; idx++)
3441       {
3442         int elt = charbuf[idx];
3443
3444         if (elt == -2)
3445           {
3446             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3447             idx++;
3448           }
3449         else if (elt == -1)
3450           {
3451             charbuf[idx++] = ISO_CODE_ESC;
3452             charbuf[idx] = '0';
3453             new_chars += 2;
3454           }
3455       }
3456   cmp_status->state = COMPOSING_NO;
3457   return new_chars;
3458 }
3459
/* If characters are under composition, finish the composition as
   invalid and add the count returned by finish_composition to
   char_offset.  `cmp_status', `charbuf' and `char_offset' are
   variables of the expansion site.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3466
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   An ESC 0 that arrives while the alternate chars of an ESC 3 or
   ESC 4 composition are being read only marks the end of those
   alternate chars: [ -1 -1 ] is stored and the composed chars follow.

   Any other start sequence first finishes a composition still in
   progress (as invalid) and then produces this annotation sequence
   (five elements; they are the five slots finish_composition rewrites):

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
*/

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    int altchars_end = 0;                                                  \
                                                                           \
    if (c1 == '0')                                                         \
      {                                                                    \
        if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)               \
          altchars_end = (cmp_status->state == COMPOSING_COMPONENT_CHAR);  \
        else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)     \
          altchars_end = (cmp_status->state == COMPOSING_COMPONENT_RULE);  \
      }                                                                    \
    if (altchars_end)                                                      \
      {                                                                    \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* Must run before cmp_status is overwritten below.  */            \
        MAYBE_FINISH_COMPOSITION ();                                       \
        if (c1 == '0')                                                     \
          cmp_status->method = COMPOSITION_RELATIVE;                       \
        else if (c1 == '2')                                                \
          cmp_status->method = COMPOSITION_WITH_RULE;                      \
        else if (c1 == '3')                                                \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS;                  \
        else                                                               \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;             \
        if (c1 <= '2')                                                     \
          cmp_status->state = COMPOSING_CHAR;                              \
        else                                                               \
          cmp_status->state = COMPOSING_COMPONENT_CHAR;                    \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->ncomps = 0;                                            \
        cmp_status->nchars = 0;                                            \
        coding->annotated = 1;                                             \
      }                                                                    \
  } while (0)
3507
3508
/* Handle composition end sequence ESC 1.

   The sequence is invalid when no composed char has been stored, or
   when the state does not fit the method: a rulebase composition
   (COMPOSITION_WITH_RULE) must not end in state COMPOSING_CHAR, and
   every other method must end in exactly that state.  In that case
   the composition is finished as invalid and control jumps to the
   label `invalid_code' of the expansion site.

   Otherwise the annotation header, which starts cmp_status->length
   elements before charbuf, is completed: the (negative) length slot
   is extended to cover the alternate chars, NCHARS is filled in, and
   char_offset is advanced by the number of composed chars.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int ends_on_char = (cmp_status->state == COMPOSING_CHAR);           \
    int is_rulebase = (cmp_status->method == COMPOSITION_WITH_RULE);    \
    int *cmp_header;                                                    \
                                                                        \
    if (cmp_status->nchars == 0 || ends_on_char == is_rulebase)         \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    cmp_header = charbuf - cmp_status->length;                          \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      cmp_header[0] -= cmp_status->ncomps + 2;                          \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      cmp_header[0] -= cmp_status->ncomps * 3;                          \
    cmp_header[2] = cmp_status->nchars;                                 \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3528
/* Store the pair [ -2 RULE ] for the composition rule RULE in charbuf
   and update cmp_status: two more elements belong to the composition
   and the state steps back by one (presumably from a *_RULE state to
   the matching *_CHAR state; the state enum is defined elsewhere).  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = (rule);                \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    cmp_status->state -= 1;             \
  } while (0)
3538
/* Store a composed char or a component char C in charbuf, and update
   cmp_status: the composition grows by one element, and C is counted
   in nchars when the state is COMPOSING_CHAR and in ncomps otherwise.
   When a rule has to follow the char (rulebase composition, or a
   component char of an alt&rule composition) the state is advanced by
   one.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf = (c);                                                     \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    if (cmp_status->method == COMPOSITION_WITH_RULE)                    \
      cmp_status->state += 1;                                           \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)          \
      cmp_status->state += 1;                                           \
  } while (0)
3555
3556
3557 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3558
3559 static void
3560 decode_coding_iso_2022 (coding)
3561      struct coding_system *coding;
3562 {
3563   const unsigned char *src = coding->source + coding->consumed;
3564   const unsigned char *src_end = coding->source + coding->src_bytes;
3565   const unsigned char *src_base;
3566   int *charbuf = coding->charbuf + coding->charbuf_used;
3567   /* We may produce two annocations (charset and composition) in one
3568      loop and one more charset annocation at the end.  */
3569   int *charbuf_end
3570     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3571   int consumed_chars = 0, consumed_chars_base;
3572   int multibytep = coding->src_multibyte;
3573   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3574   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3575   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3576   int charset_id_2, charset_id_3;
3577   struct charset *charset;
3578   int c;
3579   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3580   Lisp_Object attrs, charset_list;
3581   int char_offset = coding->produced_char;
3582   int last_offset = char_offset;
3583   int last_id = charset_ascii;
3584   int eol_crlf =
3585     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3586   int byte_after_cr = -1;
3587   int i;
3588
3589   CODING_GET_INFO (coding, attrs, charset_list);
3590   setup_iso_safe_charsets (attrs);
3591   /* Charset list may have been changed.  */
3592   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3593   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3594
3595   if (cmp_status->state != COMPOSING_NO)
3596     {
3597       for (i = 0; i < cmp_status->length; i++)
3598         *charbuf++ = cmp_status->carryover[i];
3599       coding->annotated = 1;
3600     }
3601
3602   while (1)
3603     {
3604       int c1, c2, c3;
3605
3606       src_base = src;
3607       consumed_chars_base = consumed_chars;
3608
3609       if (charbuf >= charbuf_end)
3610         {
3611           if (byte_after_cr >= 0)
3612             src_base--;
3613           break;
3614         }
3615
3616       if (byte_after_cr >= 0)
3617         c1 = byte_after_cr, byte_after_cr = -1;
3618       else
3619         ONE_MORE_BYTE (c1);
3620       if (c1 < 0)
3621         goto invalid_code;
3622
3623       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3624         {
3625           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3626           char_offset++;
3627           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3628           continue;
3629         }
3630
3631       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3632         {
3633           if (c1 == ISO_CODE_ESC)
3634             {
3635               if (src + 1 >= src_end)
3636                 goto no_more_source;
3637               *charbuf++ = ISO_CODE_ESC;
3638               char_offset++;
3639               if (src[0] == '%' && src[1] == '@')
3640                 {
3641                   src += 2;
3642                   consumed_chars += 2;
3643                   char_offset += 2;
3644                   /* We are sure charbuf can contain two more chars. */
3645                   *charbuf++ = '%';
3646                   *charbuf++ = '@';
3647                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3648                 }
3649             }
3650           else
3651             {
3652               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3653               char_offset++;
3654             }
3655           continue;
3656         }
3657
3658       if ((cmp_status->state == COMPOSING_RULE
3659            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3660           && c1 != ISO_CODE_ESC)
3661         {
3662           int rule, nbytes;
3663
3664           DECODE_COMPOSITION_RULE (rule, nbytes);
3665           if (rule < 0)
3666             goto invalid_code;
3667           STORE_COMPOSITION_RULE (rule);
3668           continue;
3669         }
3670
3671       /* We produce at most one character.  */
3672       switch (iso_code_class [c1])
3673         {
3674         case ISO_0x20_or_0x7F:
3675           if (charset_id_0 < 0
3676               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3677             /* This is SPACE or DEL.  */
3678             charset = CHARSET_FROM_ID (charset_ascii);
3679           else
3680             charset = CHARSET_FROM_ID (charset_id_0);
3681           break;
3682
3683         case ISO_graphic_plane_0:
3684           if (charset_id_0 < 0)
3685             charset = CHARSET_FROM_ID (charset_ascii);
3686           else
3687             charset = CHARSET_FROM_ID (charset_id_0);
3688           break;
3689
3690         case ISO_0xA0_or_0xFF:
3691           if (charset_id_1 < 0
3692               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3693               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3694             goto invalid_code;
3695           /* This is a graphic character, we fall down ... */
3696
3697         case ISO_graphic_plane_1:
3698           if (charset_id_1 < 0)
3699             goto invalid_code;
3700           charset = CHARSET_FROM_ID (charset_id_1);
3701           break;
3702
3703         case ISO_control_0:
3704           if (eol_crlf && c1 == '\r')
3705             ONE_MORE_BYTE (byte_after_cr);
3706           MAYBE_FINISH_COMPOSITION ();
3707           charset = CHARSET_FROM_ID (charset_ascii);
3708           break;
3709
3710         case ISO_control_1:
3711           goto invalid_code;
3712
3713         case ISO_shift_out:
3714           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3715               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3716             goto invalid_code;
3717           CODING_ISO_INVOCATION (coding, 0) = 1;
3718           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3719           continue;
3720
3721         case ISO_shift_in:
3722           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3723             goto invalid_code;
3724           CODING_ISO_INVOCATION (coding, 0) = 0;
3725           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3726           continue;
3727
3728         case ISO_single_shift_2_7:
3729           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))
3730             goto invalid_code;
3731         case ISO_single_shift_2:
3732           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3733             goto invalid_code;
3734           /* SS2 is handled as an escape sequence of ESC 'N' */
3735           c1 = 'N';
3736           goto label_escape_sequence;
3737
3738         case ISO_single_shift_3:
3739           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3740             goto invalid_code;
3741           /* SS2 is handled as an escape sequence of ESC 'O' */
3742           c1 = 'O';
3743           goto label_escape_sequence;
3744
3745         case ISO_control_sequence_introducer:
3746           /* CSI is handled as an escape sequence of ESC '[' ...  */
3747           c1 = '[';
3748           goto label_escape_sequence;
3749
3750         case ISO_escape:
3751           ONE_MORE_BYTE (c1);
3752         label_escape_sequence:
3753           /* Escape sequences handled here are invocation,
3754              designation, direction specification, and character
3755              composition specification.  */
3756           switch (c1)
3757             {
3758             case '&':           /* revision of following character set */
3759               ONE_MORE_BYTE (c1);
3760               if (!(c1 >= '@' && c1 <= '~'))
3761                 goto invalid_code;
3762               ONE_MORE_BYTE (c1);
3763               if (c1 != ISO_CODE_ESC)
3764                 goto invalid_code;
3765               ONE_MORE_BYTE (c1);
3766               goto label_escape_sequence;
3767
3768             case '$':           /* designation of 2-byte character set */
3769               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3770                 goto invalid_code;
3771               {
3772                 int reg, chars96;
3773
3774                 ONE_MORE_BYTE (c1);
3775                 if (c1 >= '@' && c1 <= 'B')
3776                   {     /* designation of JISX0208.1978, GB2312.1980,
3777                            or JISX0208.1980 */
3778                     reg = 0, chars96 = 0;
3779                   }
3780                 else if (c1 >= 0x28 && c1 <= 0x2B)
3781                   { /* designation of DIMENSION2_CHARS94 character set */
3782                     reg = c1 - 0x28, chars96 = 0;
3783                     ONE_MORE_BYTE (c1);
3784                   }
3785                 else if (c1 >= 0x2C && c1 <= 0x2F)
3786                   { /* designation of DIMENSION2_CHARS96 character set */
3787                     reg = c1 - 0x2C, chars96 = 1;
3788                     ONE_MORE_BYTE (c1);
3789                   }
3790                 else
3791                   goto invalid_code;
3792                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3793                 /* We must update these variables now.  */
3794                 if (reg == 0)
3795                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3796                 else if (reg == 1)
3797                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3798                 if (chars96 < 0)
3799                   goto invalid_code;
3800               }
3801               continue;
3802
3803             case 'n':           /* invocation of locking-shift-2 */
3804               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3805                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3806                 goto invalid_code;
3807               CODING_ISO_INVOCATION (coding, 0) = 2;
3808               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3809               continue;
3810
3811             case 'o':           /* invocation of locking-shift-3 */
3812               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3813                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3814                 goto invalid_code;
3815               CODING_ISO_INVOCATION (coding, 0) = 3;
3816               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3817               continue;
3818
3819             case 'N':           /* invocation of single-shift-2 */
3820               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3821                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3822                 goto invalid_code;
3823               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3824               if (charset_id_2 < 0)
3825                 charset = CHARSET_FROM_ID (charset_ascii);
3826               else
3827                 charset = CHARSET_FROM_ID (charset_id_2);
3828               ONE_MORE_BYTE (c1);
3829               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3830                 goto invalid_code;
3831               break;
3832
3833             case 'O':           /* invocation of single-shift-3 */
3834               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3835                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3836                 goto invalid_code;
3837               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3838               if (charset_id_3 < 0)
3839                 charset = CHARSET_FROM_ID (charset_ascii);
3840               else
3841                 charset = CHARSET_FROM_ID (charset_id_3);
3842               ONE_MORE_BYTE (c1);
3843               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3844                 goto invalid_code;
3845               break;
3846
3847             case '0': case '2': case '3': case '4': /* start composition */
3848               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3849                 goto invalid_code;
3850               if (last_id != charset_ascii)
3851                 {
3852                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3853                   last_id = charset_ascii;
3854                   last_offset = char_offset;
3855                 }
3856               DECODE_COMPOSITION_START (c1);
3857               continue;
3858
3859             case '1':           /* end composition */
3860               if (cmp_status->state == COMPOSING_NO)
3861                 goto invalid_code;
3862               DECODE_COMPOSITION_END ();
3863               continue;
3864
3865             case '[':           /* specification of direction */
3866               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION))
3867                 goto invalid_code;
              /* For the moment, nested direction is not supported.
                 So, `coding->mode & CODING_MODE_DIRECTION' zero means
                 left-to-right, and nonzero means right-to-left.  */
3871               ONE_MORE_BYTE (c1);
3872               switch (c1)
3873                 {
3874                 case ']':       /* end of the current direction */
3875                   coding->mode &= ~CODING_MODE_DIRECTION;
3876
3877                 case '0':       /* end of the current direction */
3878                 case '1':       /* start of left-to-right direction */
3879                   ONE_MORE_BYTE (c1);
3880                   if (c1 == ']')
3881                     coding->mode &= ~CODING_MODE_DIRECTION;
3882                   else
3883                     goto invalid_code;
3884                   break;
3885
3886                 case '2':       /* start of right-to-left direction */
3887                   ONE_MORE_BYTE (c1);
3888                   if (c1 == ']')
3889                     coding->mode |= CODING_MODE_DIRECTION;
3890                   else
3891                     goto invalid_code;
3892                   break;
3893
3894                 default:
3895                   goto invalid_code;
3896                 }
3897               continue;
3898
3899             case '%':
3900               ONE_MORE_BYTE (c1);
3901               if (c1 == '/')
3902                 {
3903                   /* CTEXT extended segment:
3904                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3905                      We keep these bytes as is for the moment.
3906                      They may be decoded by post-read-conversion.  */
3907                   int dim, M, L;
3908                   int size;
3909
3910                   ONE_MORE_BYTE (dim);
3911                   if (dim < 0 || dim > 4)
3912                     goto invalid_code;
3913                   ONE_MORE_BYTE (M);
3914                   if (M < 128)
3915                     goto invalid_code;
3916                   ONE_MORE_BYTE (L);
3917                   if (L < 128)
3918                     goto invalid_code;
3919                   size = ((M - 128) * 128) + (L - 128);
3920                   if (charbuf + 6 > charbuf_end)
3921                     goto break_loop;
3922                   *charbuf++ = ISO_CODE_ESC;
3923                   *charbuf++ = '%';
3924                   *charbuf++ = '/';
3925                   *charbuf++ = dim;
3926                   *charbuf++ = BYTE8_TO_CHAR (M);
3927                   *charbuf++ = BYTE8_TO_CHAR (L);
3928                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3929                 }
3930               else if (c1 == 'G')
3931                 {
3932                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3933                      ESC % G --UTF-8-BYTES-- ESC % @
3934                      We keep these bytes as is for the moment.
3935                      They may be decoded by post-read-conversion.  */
3936                   if (charbuf + 3 > charbuf_end)
3937                     goto break_loop;
3938                   *charbuf++ = ISO_CODE_ESC;
3939                   *charbuf++ = '%';
3940                   *charbuf++ = 'G';
3941                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3942                 }
3943               else
3944                 goto invalid_code;
3945               continue;
3946               break;
3947
3948             default:
3949               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3950                 goto invalid_code;
3951               {
3952                 int reg, chars96;
3953
3954                 if (c1 >= 0x28 && c1 <= 0x2B)
3955                   { /* designation of DIMENSION1_CHARS94 character set */
3956                     reg = c1 - 0x28, chars96 = 0;
3957                     ONE_MORE_BYTE (c1);
3958                   }
3959                 else if (c1 >= 0x2C && c1 <= 0x2F)
3960                   { /* designation of DIMENSION1_CHARS96 character set */
3961                     reg = c1 - 0x2C, chars96 = 1;
3962                     ONE_MORE_BYTE (c1);
3963                   }
3964                 else
3965                   goto invalid_code;
3966                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3967                 /* We must update these variables now.  */
3968                 if (reg == 0)
3969                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3970                 else if (reg == 1)
3971                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3972                 if (chars96 < 0)
3973                   goto invalid_code;
3974               }
3975               continue;
3976             }
3977         }
3978
3979       if (cmp_status->state == COMPOSING_NO
3980           && charset->id != charset_ascii
3981           && last_id != charset->id)
3982         {
3983           if (last_id != charset_ascii)
3984             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3985           last_id = charset->id;
3986           last_offset = char_offset;
3987         }
3988
3989       /* Now we know CHARSET and 1st position code C1 of a character.
3990          Produce a decoded character while getting 2nd and 3rd
3991          position codes C2, C3 if necessary.  */
3992       if (CHARSET_DIMENSION (charset) > 1)
3993         {
3994           ONE_MORE_BYTE (c2);
3995           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3996               || ((c1 & 0x80) != (c2 & 0x80)))
3997             /* C2 is not in a valid range.  */
3998             goto invalid_code;
3999           if (CHARSET_DIMENSION (charset) == 2)
4000             c1 = (c1 << 8) | c2;
4001           else
4002             {
4003               ONE_MORE_BYTE (c3);
4004               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
4005                   || ((c1 & 0x80) != (c3 & 0x80)))
4006                 /* C3 is not in a valid range.  */
4007                 goto invalid_code;
4008               c1 = (c1 << 16) | (c2 << 8) | c2;
4009             }
4010         }
4011       c1 &= 0x7F7F7F;
4012       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4013       if (c < 0)
4014         {
4015           MAYBE_FINISH_COMPOSITION ();
4016           for (; src_base < src; src_base++, char_offset++)
4017             {
4018               if (ASCII_BYTE_P (*src_base))
4019                 *charbuf++ = *src_base;
4020               else
4021                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4022             }
4023         }
4024       else if (cmp_status->state == COMPOSING_NO)
4025         {
4026           *charbuf++ = c;
4027           char_offset++;
4028         }
4029       else if ((cmp_status->state == COMPOSING_CHAR
4030                 ? cmp_status->nchars
4031                 : cmp_status->ncomps)
4032                >= MAX_COMPOSITION_COMPONENTS)
4033         {
4034           /* Too long composition.  */
4035           MAYBE_FINISH_COMPOSITION ();
4036           *charbuf++ = c;
4037           char_offset++;
4038         }
4039       else
4040         STORE_COMPOSITION_CHAR (c);
4041       continue;
4042
4043     invalid_code:
4044       MAYBE_FINISH_COMPOSITION ();
4045       src = src_base;
4046       consumed_chars = consumed_chars_base;
4047       ONE_MORE_BYTE (c);
4048       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4049       char_offset++;
4050       coding->errors++;
4051       continue;
4052
4053     break_loop:
4054       break;
4055     }
4056
4057  no_more_source:
4058   if (cmp_status->state != COMPOSING_NO)
4059     {
4060       if (coding->mode & CODING_MODE_LAST_BLOCK)
4061         MAYBE_FINISH_COMPOSITION ();
4062       else
4063         {
4064           charbuf -= cmp_status->length;
4065           for (i = 0; i < cmp_status->length; i++)
4066             cmp_status->carryover[i] = charbuf[i];
4067         }
4068     }
4069   else if (last_id != charset_ascii)
4070     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4071   coding->consumed_char += consumed_chars_base;
4072   coding->consumed = src_base - coding->source;
4073   coding->charbuf_used = charbuf - coding->charbuf;
4074 }
4075
4076
4077 /* ISO2022 encoding stuff.  */
4078
4079 /*
   Saying just "ISO2022" is not enough to specify an encoding; more
   details are needed.  In Emacs, each coding system of the ISO2022
   family has the following specifications:
4083         1. Initial designation to G0 thru G3.
4084         2. Allows short-form designation?
4085         3. ASCII should be designated to G0 before control characters?
4086         4. ASCII should be designated to G0 at end of line?
4087         5. 7-bit environment or 8-bit environment?
4088         6. Use locking-shift?
4089         7. Use Single-shift?
4090    And the following two are only for Japanese:
4091         8. Use ASCII in place of JIS0201-1976-Roman?
4092         9. Use JISX0208-1983 in place of JISX0208-1978?
4093    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
4094    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
4095    details.
4096 */
4097
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, and record the new designation in CODING.

   A revision prefix (ESC '&' <'@' + revision>) is emitted first when
   CODING has CODING_ISO_FLAG_REVISION set and CHARSET has a
   non-negative revision.  For a multi-dimension 94-charset designated
   to register 0 whose <final-char> is '@', 'A', or 'B', the
   intermediate character is left out (short form) unless CODING has
   CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate characters, indexed by graphic register.  */        \
    static const char designators_94[] = "()*+";                        \
    static const char designators_96[] = ",-./";                        \
    unsigned char final_byte = CHARSET_ISO_FINAL (charset);             \
    int rev = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)   \
               ? CHARSET_ISO_REVISION (charset)                         \
               : -1);                                                   \
                                                                        \
    if (rev >= 0)                                                       \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + rev);                                      \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (designators_96[reg]);                    \
        else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM  \
                 || reg != 0                                            \
                 || final_byte < '@' || final_byte > 'B')               \
          EMIT_ONE_ASCII_BYTE (designators_94[reg]);                    \
      }                                                                 \
    else if (CHARSET_ISO_CHARS_96 (charset))                            \
      EMIT_ONE_ASCII_BYTE (designators_96[reg]);                        \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (designators_94[reg]);                        \
    EMIT_ONE_ASCII_BYTE (final_byte);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4145
4146
/* The next two macros emit the ISO2022 single-shift functions
   (single-shift-2 and single-shift-3): an escape sequence when CODING
   has CODING_ISO_FLAG_SEVEN_BITS set, a single control byte
   otherwise.  Both then mark CODING as single-shifting.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4159
4160
/* Emit single-shift-3 (ESC 'O' when CODING has
   CODING_ISO_FLAG_SEVEN_BITS set, the byte ISO_CODE_SS3 otherwise)
   and mark CODING as single-shifting.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
4169
4170
/* The next four macros emit the ISO2022 locking-shift functions
   (shift-in, shift-out, locking-shift-2, and locking-shift-3) as a
   control character or an escape sequence, and record in CODING
   which graphic register is now invoked to graphic plane 0.  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
4180
4181
/* Emit shift-out (ISO_CODE_SO) and record that graphic register 1 is
   now invoked to graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
4187
4188
/* Emit locking-shift-2 (ESC 'n') and record that graphic register 2
   is now invoked to graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
4194
4195
/* Emit locking-shift-3 (ESC 'o') and record that graphic register 3
   is now invoked to graphic plane 0.  The final byte must be 'o':
   ESC 'n' is locking-shift-2, and the ISO2022 decoder in this file
   distinguishes the two, so emitting 'n' here would make the recorded
   invocation disagree with the bytes actually produced.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4201
4202
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by JISX0201-Roman.  C1 is parenthesized at every use so that an
   expression argument is masked as a whole, as in
   ENCODE_ISO_CHARACTER_DIMENSION2.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A pending single shift applies to this character only.  */   \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4245
4246
/* Emit the two position codes C1 and C2 of a DIMENSION2 character of
   CHARSET, first producing whatever designation and invocation codes
   are needed to make CHARSET usable.  CHARSET must be an lvalue:
   when CODING has CODING_ISO_FLAG_USE_OLDJIS set, JISX0208 is
   replaced by JISX0208-1978.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A pending single shift applies to this character only.  */   \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
    if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
    /* CHARSET is not invoked to either graphic plane yet.  Invoke      \
       it (designating it to a graphic register first if needed),       \
       then go around the loop again to emit the character.  */         \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
4289
4290
/* Encode character C in CHARSET: compute its code with ENCODE_CHAR,
   then hand it to the DIMENSION1 emitter when CHARSET has dimension
   1, and otherwise to the DIMENSION2 emitter with the code split
   into (code >> 8) and (code & 0xFF).  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int position_code = ENCODE_CHAR ((charset),(c));                       \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), position_code >> 8,      \
                                       position_code & 0xFF);              \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), position_code);          \
  } while (0)
4300
4301
4302 /* Produce designation and invocation codes at a place pointed by DST
4303    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4304    Return new DST.  */
4305
4306 unsigned char *
4307 encode_invocation_designation (charset, coding, dst, p_nchars)
4308      struct charset *charset;
4309      struct coding_system *coding;
4310      unsigned char *dst;
4311      int *p_nchars;
4312 {
4313   int multibytep = coding->dst_multibyte;
4314   int produced_chars = *p_nchars;
4315   int reg;                      /* graphic register number */
4316   int id = CHARSET_ID (charset);
4317
4318   /* At first, check designations.  */
4319   for (reg = 0; reg < 4; reg++)
4320     if (id == CODING_ISO_DESIGNATION (coding, reg))
4321       break;
4322
4323   if (reg >= 4)
4324     {
4325       /* CHARSET is not yet designated to any graphic registers.  */
4326       /* At first check the requested designation.  */
4327       reg = CODING_ISO_REQUEST (coding, id);
4328       if (reg < 0)
4329         /* Since CHARSET requests no special designation, designate it
4330            to graphic register 0.  */
4331         reg = 0;
4332
4333       ENCODE_DESIGNATION (charset, reg, coding);
4334     }
4335
4336   if (CODING_ISO_INVOCATION (coding, 0) != reg
4337       && CODING_ISO_INVOCATION (coding, 1) != reg)
4338     {
4339       /* Since the graphic register REG is not invoked to any graphic
4340          planes, invoke it to graphic plane 0.  */
4341       switch (reg)
4342         {
4343         case 0:                 /* graphic register 0 */
4344           ENCODE_SHIFT_IN;
4345           break;
4346
4347         case 1:                 /* graphic register 1 */
4348           ENCODE_SHIFT_OUT;
4349           break;
4350
4351         case 2:                 /* graphic register 2 */
4352           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4353             ENCODE_SINGLE_SHIFT_2;
4354           else
4355             ENCODE_LOCKING_SHIFT_2;
4356           break;
4357
4358         case 3:                 /* graphic register 3 */
4359           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4360             ENCODE_SINGLE_SHIFT_3;
4361           else
4362             ENCODE_LOCKING_SHIFT_3;
4363           break;
4364         }
4365     }
4366
4367   *p_nchars = produced_chars;
4368   return dst;
4369 }
4370
/* The following three macros produce codes for indicating direction
   of text.  */

/* Emit a control sequence introducer: ESC '[' when CODING has
   CODING_ISO_FLAG_SEVEN_BITS set, the single byte ISO_CODE_CSI
   otherwise.  The flag is tested with `&' because CODING_ISO_FLAGS
   holds a set of flag bits; comparing with `==' would select the
   7-bit form only when no other flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
4380
4381
/* Emit the sequence that starts right-to-left text: a control
   sequence introducer followed by '2' ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; passing one would leave a stray
   `(dst)' after its `while (0)' and fail to compile.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
4387
4388
/* Emit a control sequence introducer followed by '0' ']', which the
   ISO2022 decoder in this file treats as the end of the current
   direction (i.e. back to left-to-right).
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   is used without an argument list; passing one would leave a stray
   `(dst)' after its `while (0)' and fail to compile.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
4394
4395
/* Bring the graphic planes and registers of CODING back to their
   initial state: emit shift-in unless graphic register 0 is invoked
   to graphic plane 0, then re-designate every graphic register that
   has an initial charset (>= 0) differing from its current one.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    struct charset *initial_cs;                                         \
    int reg;                                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        /* Skip registers with no initial charset, or already           \
           holding it.  */                                              \
        if (CODING_ISO_INITIAL (coding, reg) < 0                        \
            || (CODING_ISO_DESIGNATION (coding, reg)                    \
                == CODING_ISO_INITIAL (coding, reg)))                   \
          continue;                                                     \
        initial_cs = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
        ENCODE_DESIGNATION (initial_cs, reg, coding);                   \
      }                                                                 \
  } while (0)
4414
4415
4416 /* Produce designation sequences of charsets in the line started from
4417    SRC to a place pointed by DST, and return updated DST.
4418
4419    If the current block ends before any end-of-line, we may fail to
4420    find all the necessary designations.  */
4421
4422 static unsigned char *
4423 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
4424      struct coding_system *coding;
4425      int *charbuf, *charbuf_end;
4426      unsigned char *dst;
4427 {
4428   struct charset *charset;
4429   /* Table of charsets to be designated to each graphic register.  */
4430   int r[4];
4431   int c, found = 0, reg;
4432   int produced_chars = 0;
4433   int multibytep = coding->dst_multibyte;
4434   Lisp_Object attrs;
4435   Lisp_Object charset_list;
4436
4437   attrs = CODING_ID_ATTRS (coding->id);
4438   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4439   if (EQ (charset_list, Qiso_2022))
4440     charset_list = Viso_2022_charset_list;
4441
4442   for (reg = 0; reg < 4; reg++)
4443     r[reg] = -1;
4444
4445   while (found < 4)
4446     {
4447       int id;
4448
4449       c = *charbuf++;
4450       if (c == '\n')
4451         break;
4452       charset = char_charset (c, charset_list, NULL);
4453       id = CHARSET_ID (charset);
4454       reg = CODING_ISO_REQUEST (coding, id);
4455       if (reg >= 0 && r[reg] < 0)
4456         {
4457           found++;
4458           r[reg] = id;
4459         }
4460     }
4461
4462   if (found)
4463     {
4464       for (reg = 0; reg < 4; reg++)
4465         if (r[reg] >= 0
4466             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4467           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4468     }
4469
4470   return dst;
4471 }
4472
/* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

   Encode the characters stored in CODING->charbuf (CODING->charbuf_used
   elements) in an ISO-2022 based format, appending the bytes to
   CODING->destination starting at offset CODING->produced.  Before
   returning, record CODING_RESULT_SUCCESS and update CODING->produced,
   CODING->produced_char and the beginning-of-line flag of CODING.
   The return statement of this function returns 0.  */

static int
encode_coding_iso_2022 (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  /* Room requested from ASSURE_DESTINATION before each element of
     charbuf is processed.  */
  int safe_room = 16;
  /* Nonzero when designation sequences must be produced before the
     next character, i.e. we are at a beginning of line and the coding
     system asks for designation at BOL.  */
  int bol_designation
    = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
       && CODING_ISO_BOL (coding));
  int produced_chars = 0;
  Lisp_Object attrs, eol_type, charset_list;
  int ascii_compatible;
  int c;
  /* Charset ID taken from the most recent charset annotation, or -1
     when there is none or when it is not a member of charset_list.  */
  int preferred_charset_id = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  /* A vector EOL type is handled as Qunix here (presumably it means
     the EOL format is not decided yet -- confirm).  */
  if (VECTORP (eol_type))
    eol_type = Qunix;

  setup_iso_safe_charsets (attrs);
  /* Charset list may have been changed.  */
  charset_list = CODING_ATTR_CHARSET_LIST (attrs);
  coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));

  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));

  while (charbuf < charbuf_end)
    {
      ASSURE_DESTINATION (safe_room);

      if (bol_designation)
        {
          unsigned char *dst_prev = dst;

          /* We have to produce designation sequences if any now.  */
          dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
          bol_designation = 0;
          /* We are sure that designation sequences are all ASCII bytes.  */
          produced_chars += dst - dst_prev;
        }

      c = *charbuf++;

      if (c < 0)
        {
          /* Handle an annotation.  A negative element introduces it;
             the element that follows is the annotation type.  */
          switch (*charbuf)
            {
            case CODING_ANNOTATE_COMPOSITION_MASK:
              /* Not yet implemented.  */
              break;
            case CODING_ANNOTATE_CHARSET_MASK:
              preferred_charset_id = charbuf[2];
              /* Ignore a preferred charset that this coding system
                 does not list.  */
              if (preferred_charset_id >= 0
                  && NILP (Fmemq (make_number (preferred_charset_id),
                                  charset_list)))
                preferred_charset_id = -1;
              break;
            default:
              abort ();
            }
          /* Skip the remaining -C - 1 elements of the annotation.  */
          charbuf += -c - 1;
          continue;
        }

      /* Now encode the character C.  */
      if (c < 0x20 || c == 0x7F)
        {
          /* Control character.  */
          if (c == '\n'
              || (c == '\r' && EQ (eol_type, Qmac)))
            {
              /* End of line: reset and/or re-initialize the
                 designation state as requested by the flags.  */
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
                ENCODE_RESET_PLANE_AND_REGISTER ();
              if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
                {
                  int i;

                  for (i = 0; i < 4; i++)
                    CODING_ISO_DESIGNATION (coding, i)
                      = CODING_ISO_INITIAL (coding, i);
                }
              /* The next character starts a line; this is nonzero
                 (not necessarily 1) when designation at BOL is on.  */
              bol_designation
                = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
            }
          else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
            ENCODE_RESET_PLANE_AND_REGISTER ();
          EMIT_ONE_ASCII_BYTE (c);
        }
      else if (ASCII_CHAR_P (c))
        {
          if (ascii_compatible)
            EMIT_ONE_ASCII_BYTE (c);
          else
            {
              struct charset *charset = CHARSET_FROM_ID (charset_ascii);
              ENCODE_ISO_CHARACTER (charset, c);
            }
        }
      else if (CHAR_BYTE8_P (c))
        {
          /* A raw 8-bit byte is emitted as is.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          struct charset *charset;

          /* Try the charset named by the last charset annotation
             first; fall back to searching charset_list.  */
          if (preferred_charset_id >= 0)
            {
              charset = CHARSET_FROM_ID (preferred_charset_id);
              if (! CHAR_CHARSET_P (c, charset))
                charset = char_charset (c, charset_list, NULL);
            }
          else
            charset = char_charset (c, charset_list, NULL);
          if (!charset)
            {
              /* C is not encodable: substitute either the
                 "inhibit substitution" marker character or the
                 default character of CODING.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                {
                  c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
                  charset = CHARSET_FROM_ID (charset_ascii);
                }
              else
                {
                  c = coding->default_char;
                  charset = char_charset (c, charset_list, NULL);
                }
            }
          ENCODE_ISO_CHARACTER (charset, c);
        }
    }

  /* At the end of the last block, reset the state if the coding
     system resets at end of line.  */
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
    {
      ASSURE_DESTINATION (safe_room);
      ENCODE_RESET_PLANE_AND_REGISTER ();
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  /* Remember the BOL state for the next call on this CODING.  */
  CODING_ISO_BOL (coding) = bol_designation;
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
4624
4625 \f
/*** 8. SJIS and BIG5 handlers ***/
4627
4628 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4629    quite widely.  So, for the moment, Emacs supports them in the bare
4630    C code.  But, in the future, they may be supported only by CCL.  */
4631
4632 /* SJIS is a coding system encoding three character sets: ASCII, right
4633    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4634    as is.  A character of charset katakana-jisx0201 is encoded by
4635    "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes; its two position-codes are divided and
   shifted so that they fit in the ranges below.
4638
4639    --- CODE RANGE of SJIS ---
4640    (character set)      (range)
4641    ASCII                0x00 .. 0x7F
4642    KATAKANA-JISX0201    0xA0 .. 0xDF
4643    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4644             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4645    -------------------------------
4646
4647 */
4648
4649 /* BIG5 is a coding system encoding two character sets: ASCII and
4650    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
4652
4653    --- CODE RANGE of BIG5 ---
4654    (character set)      (range)
4655    ASCII                0x00 .. 0x7F
4656    Big5 (1st byte)      0xA1 .. 0xFE
4657         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4658    --------------------------
4659
4660   */
4661
4662 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4663    Check if a text is encoded in SJIS.  If it is, return
4664    CATEGORY_MASK_SJIS, else return 0.  */
4665
4666 static int
4667 detect_coding_sjis (coding, detect_info)
4668      struct coding_system *coding;
4669      struct coding_detection_info *detect_info;
4670 {
4671   const unsigned char *src = coding->source, *src_base;
4672   const unsigned char *src_end = coding->source + coding->src_bytes;
4673   int multibytep = coding->src_multibyte;
4674   int consumed_chars = 0;
4675   int found = 0;
4676   int c;
4677   Lisp_Object attrs, charset_list;
4678   int max_first_byte_of_2_byte_code;
4679
4680   CODING_GET_INFO (coding, attrs, charset_list);
4681   max_first_byte_of_2_byte_code
4682     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4683
4684   detect_info->checked |= CATEGORY_MASK_SJIS;
4685   /* A coding system of this category is always ASCII compatible.  */
4686   src += coding->head_ascii;
4687
4688   while (1)
4689     {
4690       src_base = src;
4691       ONE_MORE_BYTE (c);
4692       if (c < 0x80)
4693         continue;
4694       if ((c >= 0x81 && c <= 0x9F)
4695           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4696         {
4697           ONE_MORE_BYTE (c);
4698           if (c < 0x40 || c == 0x7F || c > 0xFC)
4699             break;
4700           found = CATEGORY_MASK_SJIS;
4701         }
4702       else if (c >= 0xA0 && c < 0xE0)
4703         found = CATEGORY_MASK_SJIS;
4704       else
4705         break;
4706     }
4707   detect_info->rejected |= CATEGORY_MASK_SJIS;
4708   return 0;
4709
4710  no_more_source:
4711   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4712     {
4713       detect_info->rejected |= CATEGORY_MASK_SJIS;
4714       return 0;
4715     }
4716   detect_info->found |= found;
4717   return 1;
4718 }
4719
4720 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4721    Check if a text is encoded in BIG5.  If it is, return
4722    CATEGORY_MASK_BIG5, else return 0.  */
4723
4724 static int
4725 detect_coding_big5 (coding, detect_info)
4726      struct coding_system *coding;
4727      struct coding_detection_info *detect_info;
4728 {
4729   const unsigned char *src = coding->source, *src_base;
4730   const unsigned char *src_end = coding->source + coding->src_bytes;
4731   int multibytep = coding->src_multibyte;
4732   int consumed_chars = 0;
4733   int found = 0;
4734   int c;
4735
4736   detect_info->checked |= CATEGORY_MASK_BIG5;
4737   /* A coding system of this category is always ASCII compatible.  */
4738   src += coding->head_ascii;
4739
4740   while (1)
4741     {
4742       src_base = src;
4743       ONE_MORE_BYTE (c);
4744       if (c < 0x80)
4745         continue;
4746       if (c >= 0xA1)
4747         {
4748           ONE_MORE_BYTE (c);
4749           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4750             return 0;
4751           found = CATEGORY_MASK_BIG5;
4752         }
4753       else
4754         break;
4755     }
4756   detect_info->rejected |= CATEGORY_MASK_BIG5;
4757   return 0;
4758
4759  no_more_source:
4760   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4761     {
4762       detect_info->rejected |= CATEGORY_MASK_BIG5;
4763       return 0;
4764     }
4765   detect_info->found |= found;
4766   return 1;
4767 }
4768
/* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

   Decode SJIS text.  Bytes are read from CODING->source starting at
   offset CODING->consumed, and decoded characters (plus charset
   annotations) are appended to CODING->charbuf.  On exit,
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated.  The charsets are taken from the charset list of
   CODING in the order roman, kana, kanji and, optionally, a second
   kanji charset.  */

static void
decode_coding_sjis (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the byte sequence being decoded in the current loop.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_kanji, *charset_kana;
  struct charset *charset_kanji2;
  Lisp_Object attrs, charset_list, val;
  int char_offset = coding->produced_char;
  /* Character offset at which the current run of LAST_ID started.  */
  int last_offset = char_offset;
  /* Charset ID of the current run; charset_ascii means no run that
     needs an annotation is open.  */
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte already read after a CR when EOL_CRLF is nonzero, or -1 if
     no such byte is pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);

  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  /* The fourth charset is optional.  */
  charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No more room.  A pending byte read after CR has not been
             decoded yet, so count it as not consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);
      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* For CRLF text, read the byte following a CR right away
             and keep it for the next loop.  */
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else if (c == 0x80 || c == 0xA0)
        goto invalid_code;
      else if (c >= 0xA1 && c <= 0xDF)
        {
          /* SJIS -> JISX0201-Kana */
          c &= 0x7F;
          charset = charset_kana;
        }
      else if (c <= 0xEF)
        {
          /* SJIS -> JISX0208 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS (c);
          charset = charset_kanji;
        }
      else if (c <= 0xFC && charset_kanji2)
        {
          /* SJIS -> JISX0213-2 */
          ONE_MORE_BYTE (c1);
          if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
            goto invalid_code;
          c = (c << 8) | c1;
          SJIS_TO_JIS2 (c);
          charset = charset_kanji2;
        }
      else
        goto invalid_code;
      /* On a change to a different non-ASCII charset, close the
         previous run with an annotation and start a new run.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of the offending sequence, read one byte
         again and store it as a raw byte (or, if ONE_MORE_BYTE yields
         a negative value, store that value negated).  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* No goto in this function names this label; presumably
     ONE_MORE_BYTE jumps here when the source is exhausted -- confirm
     against the macro definition.  */
 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  /* Only what precedes the last sequence start counts as consumed.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4890
/* Decode BIG5 text.  Bytes are read from CODING->source starting at
   offset CODING->consumed, and decoded characters (plus charset
   annotations) are appended to CODING->charbuf.  On exit,
   CODING->consumed, CODING->consumed_char and CODING->charbuf_used
   are updated.  The first two charsets in the charset list of CODING
   are used as the roman and Big5 charsets.  */

static void
decode_coding_big5 (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  /* Start of the byte sequence being decoded in the current loop.  */
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  struct charset *charset_roman, *charset_big5;
  Lisp_Object attrs, charset_list, val;
  int char_offset = coding->produced_char;
  /* Character offset at which the current run of LAST_ID started.  */
  int last_offset = char_offset;
  /* Charset ID of the current run; charset_ascii means no run that
     needs an annotation is open.  */
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* Byte already read after a CR when EOL_CRLF is nonzero, or -1 if
     no such byte is pending.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  val = charset_list;
  charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
  charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));

  while (1)
    {
      int c, c1;
      struct charset *charset;

      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* No more room.  A pending byte read after CR has not been
             decoded yet, so count it as not consumed.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        c = byte_after_cr, byte_after_cr = -1;
      else
        ONE_MORE_BYTE (c);

      if (c < 0)
        goto invalid_code;
      if (c < 0x80)
        {
          /* For CRLF text, read the byte following a CR right away
             and keep it for the next loop.  */
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
          charset = charset_roman;
        }
      else
        {
          /* BIG5 -> Big5 */
          if (c < 0xA1 || c > 0xFE)
            goto invalid_code;
          ONE_MORE_BYTE (c1);
          /* The second byte must be in 0x40..0x7E or 0xA1..0xFE.  */
          if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
            goto invalid_code;
          c = c << 8 | c1;
          charset = charset_big5;
        }
      /* On a change to a different non-ASCII charset, close the
         previous run with an annotation and start a new run.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }
      CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of the offending sequence, read one byte
         again and store it as a raw byte (or, if ONE_MORE_BYTE yields
         a negative value, store that value negated).  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

  /* No goto in this function names this label; presumably
     ONE_MORE_BYTE jumps here when the source is exhausted -- confirm
     against the macro definition.  */
 no_more_source:
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  /* Only what precedes the last sequence start counts as consumed.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
4987
4988 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4989    This function can encode charsets `ascii', `katakana-jisx0201',
4990    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4991    are sure that all these charsets are registered as official charset
4992    (i.e. do not have extended leading-codes).  Characters of other
4993    charsets are produced without any encoding.  If SJIS_P is 1, encode
4994    SJIS text, else encode BIG5 text.  */
4995
4996 static int
4997 encode_coding_sjis (coding)
4998      struct coding_system *coding;
4999 {
5000   int multibytep = coding->dst_multibyte;
5001   int *charbuf = coding->charbuf;
5002   int *charbuf_end = charbuf + coding->charbuf_used;
5003   unsigned char *dst = coding->destination + coding->produced;
5004   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5005   int safe_room = 4;
5006   int produced_chars = 0;
5007   Lisp_Object attrs, charset_list, val;
5008   int ascii_compatible;
5009   struct charset *charset_roman, *charset_kanji, *charset_kana;
5010   struct charset *charset_kanji2;
5011   int c;
5012
5013   CODING_GET_INFO (coding, attrs, charset_list);
5014   val = charset_list;
5015   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5016   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5017   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5018   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
5019
5020   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5021
5022   while (charbuf < charbuf_end)
5023     {
5024       ASSURE_DESTINATION (safe_room);
5025       c = *charbuf++;
5026       /* Now encode the character C.  */
5027       if (ASCII_CHAR_P (c) && ascii_compatible)
5028         EMIT_ONE_ASCII_BYTE (c);
5029       else if (CHAR_BYTE8_P (c))
5030         {
5031           c = CHAR_TO_BYTE8 (c);
5032           EMIT_ONE_BYTE (c);
5033         }
5034       else
5035         {
5036           unsigned code;
5037           struct charset *charset = char_charset (c, charset_list, &code);
5038
5039           if (!charset)
5040             {
5041               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5042                 {
5043                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5044                   charset = CHARSET_FROM_ID (charset_ascii);
5045                 }
5046               else
5047                 {
5048                   c = coding->default_char;
5049                   charset = char_charset (c, charset_list, &code);
5050                 }
5051             }
5052           if (code == CHARSET_INVALID_CODE (charset))
5053             abort ();
5054           if (charset == charset_kanji)
5055             {
5056               int c1, c2;
5057               JIS_TO_SJIS (code);
5058               c1 = code >> 8, c2 = code & 0xFF;
5059               EMIT_TWO_BYTES (c1, c2);
5060             }
5061           else if (charset == charset_kana)
5062             EMIT_ONE_BYTE (code | 0x80);
5063           else if (charset_kanji2 && charset == charset_kanji2)
5064             {
5065               int c1, c2;
5066
5067               c1 = code >> 8;
5068               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5069                   || c1 == 0x28
5070                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5071                 {
5072                   JIS_TO_SJIS2 (code);
5073                   c1 = code >> 8, c2 = code & 0xFF;
5074                   EMIT_TWO_BYTES (c1, c2);
5075                 }
5076               else
5077                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5078             }
5079           else
5080             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5081         }
5082     }
5083   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5084   coding->produced_char += produced_chars;
5085   coding->produced = dst - coding->destination;
5086   return 0;
5087 }
5088
5089 static int
5090 encode_coding_big5 (coding)
5091      struct coding_system *coding;
5092 {
5093   int multibytep = coding->dst_multibyte;
5094   int *charbuf = coding->charbuf;
5095   int *charbuf_end = charbuf + coding->charbuf_used;
5096   unsigned char *dst = coding->destination + coding->produced;
5097   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5098   int safe_room = 4;
5099   int produced_chars = 0;
5100   Lisp_Object attrs, charset_list, val;
5101   int ascii_compatible;
5102   struct charset *charset_roman, *charset_big5;
5103   int c;
5104
5105   CODING_GET_INFO (coding, attrs, charset_list);
5106   val = charset_list;
5107   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5108   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5109   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5110
5111   while (charbuf < charbuf_end)
5112     {
5113       ASSURE_DESTINATION (safe_room);
5114       c = *charbuf++;
5115       /* Now encode the character C.  */
5116       if (ASCII_CHAR_P (c) && ascii_compatible)
5117         EMIT_ONE_ASCII_BYTE (c);
5118       else if (CHAR_BYTE8_P (c))
5119         {
5120           c = CHAR_TO_BYTE8 (c);
5121           EMIT_ONE_BYTE (c);
5122         }
5123       else
5124         {
5125           unsigned code;
5126           struct charset *charset = char_charset (c, charset_list, &code);
5127
5128           if (! charset)
5129             {
5130               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5131                 {
5132                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5133                   charset = CHARSET_FROM_ID (charset_ascii);
5134                 }
5135               else
5136                 {
5137                   c = coding->default_char;
5138                   charset = char_charset (c, charset_list, &code);
5139                 }
5140             }
5141           if (code == CHARSET_INVALID_CODE (charset))
5142             abort ();
5143           if (charset == charset_big5)
5144             {
5145               int c1, c2;
5146
5147               c1 = code >> 8, c2 = code & 0xFF;
5148               EMIT_TWO_BYTES (c1, c2);
5149             }
5150           else
5151             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5152         }
5153     }
5154   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5155   coding->produced_char += produced_chars;
5156   coding->produced = dst - coding->destination;
5157   return 0;
5158 }
5159
5160 \f
/*** 9. CCL handlers ***/
5162
5163 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5164    Check if a text is encoded in a coding system of which
5165    encoder/decoder are written in CCL program.  If it is, return
5166    CATEGORY_MASK_CCL, else return 0.  */
5167
5168 static int
5169 detect_coding_ccl (coding, detect_info)
5170      struct coding_system *coding;
5171      struct coding_detection_info *detect_info;
5172 {
5173   const unsigned char *src = coding->source, *src_base;
5174   const unsigned char *src_end = coding->source + coding->src_bytes;
5175   int multibytep = coding->src_multibyte;
5176   int consumed_chars = 0;
5177   int found = 0;
5178   unsigned char *valids;
5179   int head_ascii = coding->head_ascii;
5180   Lisp_Object attrs;
5181
5182   detect_info->checked |= CATEGORY_MASK_CCL;
5183
5184   coding = &coding_categories[coding_category_ccl];
5185   valids = CODING_CCL_VALIDS (coding);
5186   attrs = CODING_ID_ATTRS (coding->id);
5187   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5188     src += head_ascii;
5189
5190   while (1)
5191     {
5192       int c;
5193
5194       src_base = src;
5195       ONE_MORE_BYTE (c);
5196       if (c < 0 || ! valids[c])
5197         break;
5198       if ((valids[c] > 1))
5199         found = CATEGORY_MASK_CCL;
5200     }
5201   detect_info->rejected |= CATEGORY_MASK_CCL;
5202   return 0;
5203
5204  no_more_source:
5205   detect_info->found |= found;
5206   return 1;
5207 }
5208
5209 static void
5210 decode_coding_ccl (coding)
5211      struct coding_system *coding;
5212 {
5213   const unsigned char *src = coding->source + coding->consumed;
5214   const unsigned char *src_end = coding->source + coding->src_bytes;
5215   int *charbuf = coding->charbuf + coding->charbuf_used;
5216   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5217   int consumed_chars = 0;
5218   int multibytep = coding->src_multibyte;
5219   struct ccl_program ccl;
5220   int source_charbuf[1024];
5221   int source_byteidx[1024];
5222   Lisp_Object attrs, charset_list;
5223
5224   CODING_GET_INFO (coding, attrs, charset_list);
5225   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
5226
5227   while (src < src_end)
5228     {
5229       const unsigned char *p = src;
5230       int *source, *source_end;
5231       int i = 0;
5232
5233       if (multibytep)
5234         while (i < 1024 && p < src_end)
5235           {
5236             source_byteidx[i] = p - src;
5237             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5238           }
5239       else
5240         while (i < 1024 && p < src_end)
5241           source_charbuf[i++] = *p++;
5242
5243       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5244         ccl.last_block = 1;
5245
5246       source = source_charbuf;
5247       source_end = source + i;
5248       while (source < source_end)
5249         {
5250           ccl_driver (&ccl, source, charbuf,
5251                       source_end - source, charbuf_end - charbuf,
5252                       charset_list);
5253           source += ccl.consumed;
5254           charbuf += ccl.produced;
5255           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
5256             break;
5257         }
5258       if (source < source_end)
5259         src += source_byteidx[source - source_charbuf];
5260       else
5261         src = p;
5262       consumed_chars += source - source_charbuf;
5263
5264       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
5265           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
5266         break;
5267     }
5268
5269   switch (ccl.status)
5270     {
5271     case CCL_STAT_SUSPEND_BY_SRC:
5272       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5273       break;
5274     case CCL_STAT_SUSPEND_BY_DST:
5275       break;
5276     case CCL_STAT_QUIT:
5277     case CCL_STAT_INVALID_CMD:
5278       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5279       break;
5280     default:
5281       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5282       break;
5283     }
5284   coding->consumed_char += consumed_chars;
5285   coding->consumed = src - coding->source;
5286   coding->charbuf_used = charbuf - coding->charbuf;
5287 }
5288
5289 static int
5290 encode_coding_ccl (coding)
5291      struct coding_system *coding;
5292 {
5293   struct ccl_program ccl;
5294   int multibytep = coding->dst_multibyte;
5295   int *charbuf = coding->charbuf;
5296   int *charbuf_end = charbuf + coding->charbuf_used;
5297   unsigned char *dst = coding->destination + coding->produced;
5298   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5299   int destination_charbuf[1024];
5300   int i, produced_chars = 0;
5301   Lisp_Object attrs, charset_list;
5302
5303   CODING_GET_INFO (coding, attrs, charset_list);
5304   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
5305
5306   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
5307   ccl.dst_multibyte = coding->dst_multibyte;
5308
5309   while (charbuf < charbuf_end)
5310     {
5311       ccl_driver (&ccl, charbuf, destination_charbuf,
5312                   charbuf_end - charbuf, 1024, charset_list);
5313       if (multibytep)
5314         {
5315           ASSURE_DESTINATION (ccl.produced * 2);
5316           for (i = 0; i < ccl.produced; i++)
5317             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5318         }
5319       else
5320         {
5321           ASSURE_DESTINATION (ccl.produced);
5322           for (i = 0; i < ccl.produced; i++)
5323             *dst++ = destination_charbuf[i] & 0xFF;
5324           produced_chars += ccl.produced;
5325         }
5326       charbuf += ccl.consumed;
5327       if (ccl.status == CCL_STAT_QUIT
5328           || ccl.status == CCL_STAT_INVALID_CMD)
5329         break;
5330     }
5331
5332   switch (ccl.status)
5333     {
5334     case CCL_STAT_SUSPEND_BY_SRC:
5335       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5336       break;
5337     case CCL_STAT_SUSPEND_BY_DST:
5338       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5339       break;
5340     case CCL_STAT_QUIT:
5341     case CCL_STAT_INVALID_CMD:
5342       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5343       break;
5344     default:
5345       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5346       break;
5347     }
5348
5349   coding->produced_char += produced_chars;
5350   coding->produced = dst - coding->destination;
5351   return 0;
5352 }
5353
5354
5355 \f
5356 /*** 10, 11. no-conversion handlers ***/
5357
5358 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
5359
5360 static void
5361 decode_coding_raw_text (coding)
5362      struct coding_system *coding;
5363 {
5364   int eol_crlf =
5365     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5366
5367   coding->chars_at_source = 1;
5368   coding->consumed_char = coding->src_chars;
5369   coding->consumed = coding->src_bytes;
5370   if (eol_crlf && coding->source[coding->src_bytes - 1] == '\r')
5371     {
5372       coding->consumed_char--;
5373       coding->consumed--;
5374       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5375     }
5376   else
5377     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5378 }
5379
/* Encoder used for raw text: copy the characters held in
   CODING->charbuf to CODING->destination, appending after the
   CODING->produced bytes that are already there.  How a character is
   written depends on whether the source and the destination are
   multibyte (CODING->src_multibyte, CODING->dst_multibyte); see the
   comment on each branch.  On return CODING->produced and
   CODING->produced_char are updated, CODING_RESULT_SUCCESS is
   recorded, and the value is always 0.

   NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
   outside this function and presumably use the locals `multibytep',
   `charbuf', `charbuf_end', `dst', `dst_end' and `produced_chars' by
   name -- confirm before renaming or removing any of them.  */

static int
encode_coding_raw_text (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = coding->charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  int produced_chars = 0;
  int c;

  if (multibytep)
    {
      /* Multibyte destination.  NOTE(review): the factor of two is
         presumably because every byte handed to EMIT_ONE_BYTE may
         itself be written as a multibyte sequence -- confirm against
         the macro definition.  */
      int safe_room = MAX_MULTIBYTE_LENGTH * 2;

      if (coding->src_multibyte)
        /* Multibyte source: ASCII goes out as is, an eight-bit
           character goes out as its raw byte, and any other character
           is first expanded to its multibyte form in STR and then
           emitted one byte at a time.  */
        while (charbuf < charbuf_end)
          {
            ASSURE_DESTINATION (safe_room);
            c = *charbuf++;
            if (ASCII_CHAR_P (c))
              EMIT_ONE_ASCII_BYTE (c);
            else if (CHAR_BYTE8_P (c))
              {
                c = CHAR_TO_BYTE8 (c);
                EMIT_ONE_BYTE (c);
              }
            else
              {
                unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;

                CHAR_STRING_ADVANCE (c, p1);
                while (p0 < p1)
                  {
                    EMIT_ONE_BYTE (*p0);
                    p0++;
                  }
              }
          }
      else
        /* Unibyte source: every element of CHARBUF is emitted as a
           single byte.  */
        while (charbuf < charbuf_end)
          {
            ASSURE_DESTINATION (safe_room);
            c = *charbuf++;
            EMIT_ONE_BYTE (c);
          }
    }
  else
    {
      /* Unibyte destination: bytes are stored through DST directly,
         and the produced character count is derived afterwards from
         the number of bytes written.  */
      if (coding->src_multibyte)
        {
          int safe_room = MAX_MULTIBYTE_LENGTH;

          while (charbuf < charbuf_end)
            {
              ASSURE_DESTINATION (safe_room);
              c = *charbuf++;
              if (ASCII_CHAR_P (c))
                *dst++ = c;
              else if (CHAR_BYTE8_P (c))
                *dst++ = CHAR_TO_BYTE8 (c);
              else
                CHAR_STRING_ADVANCE (c, dst);
            }
        }
      else
        {
          /* Unibyte to unibyte: room for the whole buffer is requested
             once, then the elements are copied one byte each; the
             copy also stops at DST_END.  */
          ASSURE_DESTINATION (charbuf_end - charbuf);
          while (charbuf < charbuf_end && dst < dst_end)
            *dst++ = *charbuf++;
        }
      produced_chars = dst - (coding->destination + coding->produced);
    }
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5459
5460 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5461    Check if a text is encoded in a charset-based coding system.  If it
5462    is, return 1, else return 0.  */
5463
5464 static int
5465 detect_coding_charset (coding, detect_info)
5466      struct coding_system *coding;
5467      struct coding_detection_info *detect_info;
5468 {
5469   const unsigned char *src = coding->source, *src_base;
5470   const unsigned char *src_end = coding->source + coding->src_bytes;
5471   int multibytep = coding->src_multibyte;
5472   int consumed_chars = 0;
5473   Lisp_Object attrs, valids, name;
5474   int found = 0;
5475   int head_ascii = coding->head_ascii;
5476   int check_latin_extra = 0;
5477
5478   detect_info->checked |= CATEGORY_MASK_CHARSET;
5479
5480   coding = &coding_categories[coding_category_charset];
5481   attrs = CODING_ID_ATTRS (coding->id);
5482   valids = AREF (attrs, coding_attr_charset_valids);
5483   name = CODING_ID_NAME (coding->id);
5484   if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5485                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5486       || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5487                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5488     check_latin_extra = 1;
5489
5490   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5491     src += head_ascii;
5492
5493   while (1)
5494     {
5495       int c;
5496       Lisp_Object val;
5497       struct charset *charset;
5498       int dim, idx;
5499
5500       src_base = src;
5501       ONE_MORE_BYTE (c);
5502       if (c < 0)
5503         continue;
5504       val = AREF (valids, c);
5505       if (NILP (val))
5506         break;
5507       if (c >= 0x80)
5508         {
5509           if (c < 0xA0
5510               && check_latin_extra
5511               && (!VECTORP (Vlatin_extra_code_table)
5512                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5513             break;
5514           found = CATEGORY_MASK_CHARSET;
5515         }
5516       if (INTEGERP (val))
5517         {
5518           charset = CHARSET_FROM_ID (XFASTINT (val));
5519           dim = CHARSET_DIMENSION (charset);
5520           for (idx = 1; idx < dim; idx++)
5521             {
5522               if (src == src_end)
5523                 goto too_short;
5524               ONE_MORE_BYTE (c);
5525               if (c < charset->code_space[(dim - 1 - idx) * 2]
5526                   || c > charset->code_space[(dim - 1 - idx) * 2 + 1])
5527                 break;
5528             }
5529           if (idx < dim)
5530             break;
5531         }
5532       else
5533         {
5534           idx = 1;
5535           for (; CONSP (val); val = XCDR (val))
5536             {
5537               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5538               dim = CHARSET_DIMENSION (charset);
5539               while (idx < dim)
5540                 {
5541                   if (src == src_end)
5542                     goto too_short;
5543                   ONE_MORE_BYTE (c);
5544                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5545                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5546                     break;
5547                   idx++;
5548                 }
5549               if (idx == dim)
5550                 {
5551                   val = Qnil;
5552                   break;
5553                 }
5554             }
5555           if (CONSP (val))
5556             break;
5557         }
5558     }
5559  too_short:
5560   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5561   return 0;
5562
5563  no_more_source:
5564   detect_info->found |= found;
5565   return 1;
5566 }
5567
/* Decoder for charset-based coding systems.  Read bytes from
   CODING->source, starting at offset CODING->consumed and ending at
   CODING->src_bytes, and append the decoded characters to
   CODING->charbuf after the CODING->charbuf_used entries already
   there.  On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used are updated.  A byte sequence that cannot be
   decoded contributes its first byte as a single character and
   increments CODING->errors.

   NOTE(review): `multibytep' is not referenced directly below and
   nothing visible jumps to `no_more_source'; presumably ONE_MORE_BYTE
   uses both (together with `src', `src_end', `src_base' and
   `consumed_chars') -- confirm against the macro definition before
   renaming any of them.  */

static void
decode_coding_charset (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  /* We may produce one charset annotation in one loop and one more at
     the end.  */
  int *charbuf_end
    = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  Lisp_Object attrs, charset_list, valids;
  int char_offset = coding->produced_char;
  /* LAST_ID is the charset of the current run of non-ASCII characters
     and LAST_OFFSET the character position where that run started.  */
  int last_offset = char_offset;
  int last_id = charset_ascii;
  int eol_crlf =
    !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
  /* When EOL_CRLF is set, the byte following a CR is read together
     with the CR and parked here (-1 means nothing is parked); it is
     used as the input of the next iteration.  */
  int byte_after_cr = -1;

  CODING_GET_INFO (coding, attrs, charset_list);
  valids = AREF (attrs, coding_attr_charset_valids);

  while (1)
    {
      int c;
      Lisp_Object val;
      struct charset *charset;
      int dim;
      int len = 1;
      unsigned code;

      /* Remember where this character starts so that the position can
         be restored on invalid input and reported at the end.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        {
          /* Out of room.  A parked byte has not been decoded yet, so
             step back over it to have it read again next time.  */
          if (byte_after_cr >= 0)
            src_base--;
          break;
        }

      if (byte_after_cr >= 0)
        {
          c = byte_after_cr;
          byte_after_cr = -1;
        }
      else
        {
          ONE_MORE_BYTE (c);
          if (eol_crlf && c == '\r')
            ONE_MORE_BYTE (byte_after_cr);
        }
      if (c < 0)
        goto invalid_code;
      code = c;

      /* VALIDS is indexed by the first byte of a character.  Only an
         integer (one charset ID) or a list (several candidate charset
         IDs) marks a byte that may start a character.  */
      val = AREF (valids, c);
      if (! INTEGERP (val) && ! CONSP (val))
        goto invalid_code;
      if (INTEGERP (val))
        {
          charset = CHARSET_FROM_ID (XFASTINT (val));
          dim = CHARSET_DIMENSION (charset);
          /* Accumulate the remaining bytes of the code point, most
             significant byte first.  */
          while (len < dim)
            {
              ONE_MORE_BYTE (c);
              code = (code << 8) | c;
              len++;
            }
          CODING_DECODE_CHAR (coding, src, src_base, src_end,
                              charset, code, c);
        }
      else
        {
          /* VAL is a list of charset IDs.  It is assured that the
             list is sorted by charset dimensions (smaller one
             comes first).  */
          while (CONSP (val))
            {
              charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
              dim = CHARSET_DIMENSION (charset);
              /* LEN and CODE persist across candidates, so only the
                 additional bytes a larger charset needs are read.  */
              while (len < dim)
                {
                  ONE_MORE_BYTE (c);
                  code = (code << 8) | c;
                  len++;
                }
              CODING_DECODE_CHAR (coding, src, src_base,
                                  src_end, charset, code, c);
              /* A non-negative C is a successfully decoded character.  */
              if (c >= 0)
                break;
              val = XCDR (val);
            }
        }
      if (c < 0)
        goto invalid_code;
      /* When the charset changes to a different non-ASCII one, close
         the previous run with a charset annotation (unless that run
         was ASCII) and start a new run here.  */
      if (charset->id != charset_ascii
          && last_id != charset->id)
        {
          if (last_id != charset_ascii)
            ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
          last_id = charset->id;
          last_offset = char_offset;
        }

      *charbuf++ = c;
      char_offset++;
      continue;

    invalid_code:
      /* Go back to the start of the offending sequence and pass its
         first byte through on its own: a negative value from
         ONE_MORE_BYTE is negated, an ASCII byte is kept, and any other
         byte is mapped with BYTE8_TO_CHAR.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      char_offset++;
      coding->errors++;
    }

 no_more_source:
  /* Close the charset run that is still open, then report how much
     input was consumed up to the start of the last incomplete
     character.  */
  if (last_id != charset_ascii)
    ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
5696
/* Encoder for charset-based coding systems.  Encode the characters in
   CODING->charbuf and append the resulting bytes to
   CODING->destination after the CODING->produced bytes already there.
   On return CODING->produced and CODING->produced_char are updated,
   CODING_RESULT_SUCCESS is recorded, and the value is always 0.

   NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
   outside this function and presumably use the locals `multibytep',
   `charbuf', `charbuf_end', `dst', `dst_end' and `produced_chars' by
   name -- confirm before renaming or removing any of them.  */

static int
encode_coding_charset (coding)
     struct coding_system *coding;
{
  int multibytep = coding->dst_multibyte;
  int *charbuf = coding->charbuf;
  int *charbuf_end = charbuf + coding->charbuf_used;
  unsigned char *dst = coding->destination + coding->produced;
  unsigned char *dst_end = coding->destination + coding->dst_bytes;
  int safe_room = MAX_MULTIBYTE_LENGTH;
  int produced_chars = 0;
  Lisp_Object attrs, charset_list;
  int ascii_compatible;
  int c;

  CODING_GET_INFO (coding, attrs, charset_list);
  ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));

  while (charbuf < charbuf_end)
    {
      struct charset *charset;
      unsigned code;

      ASSURE_DESTINATION (safe_room);
      c = *charbuf++;
      /* ASCII is passed through only for ASCII-compatible coding
         systems; otherwise it is looked up like any other character.  */
      if (ascii_compatible && ASCII_CHAR_P (c))
        EMIT_ONE_ASCII_BYTE (c);
      else if (CHAR_BYTE8_P (c))
        {
          /* An eight-bit character is written as its raw byte.  */
          c = CHAR_TO_BYTE8 (c);
          EMIT_ONE_BYTE (c);
        }
      else
        {
          /* Find a charset in CHARSET_LIST that contains C; CODE
             receives the code point of C in that charset.  */
          charset = char_charset (c, charset_list, &code);
          if (charset)
            {
              /* Write CODE with one byte per charset dimension, most
                 significant byte first.  */
              if (CHARSET_DIMENSION (charset) == 1)
                EMIT_ONE_BYTE (code);
              else if (CHARSET_DIMENSION (charset) == 2)
                EMIT_TWO_BYTES (code >> 8, code & 0xFF);
              else if (CHARSET_DIMENSION (charset) == 3)
                EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
              else
                EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
                                 (code >> 8) & 0xFF, code & 0xFF);
            }
          else
            {
              /* C cannot be encoded: write a substitute instead, which
                 is CODING_INHIBIT_CHARACTER_SUBSTITUTION in safe
                 encoding mode and the default character otherwise.  */
              if (coding->mode & CODING_MODE_SAFE_ENCODING)
                c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
              else
                c = coding->default_char;
              EMIT_ONE_BYTE (c);
            }
        }
    }

  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->produced_char += produced_chars;
  coding->produced = dst - coding->destination;
  return 0;
}
5760
5761 \f
/*** 10. C library functions ***/
5763
5764 /* Setup coding context CODING from information about CODING_SYSTEM.
5765    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
5766    CODING_SYSTEM is invalid, signal an error.  */
5767
5768 void
5769 setup_coding_system (coding_system, coding)
5770      Lisp_Object coding_system;
5771      struct coding_system *coding;
5772 {
5773   Lisp_Object attrs;
5774   Lisp_Object eol_type;
5775   Lisp_Object coding_type;
5776   Lisp_Object val;
5777
5778   if (NILP (coding_system))
5779     coding_system = Qundecided;
5780
5781   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5782
5783   attrs = CODING_ID_ATTRS (coding->id);
5784   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5785
5786   coding->mode = 0;
5787   coding->head_ascii = -1;
5788   if (VECTORP (eol_type))
5789     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5790                             | CODING_REQUIRE_DETECTION_MASK);
5791   else if (! EQ (eol_type, Qunix))
5792     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5793                             | CODING_REQUIRE_ENCODING_MASK);
5794   else
5795     coding->common_flags = 0;
5796   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5797     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5798   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5799     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5800   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5801     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5802
5803   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5804   coding->max_charset_id = SCHARS (val) - 1;
5805   coding->safe_charsets = SDATA (val);
5806   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5807   coding->carryover_bytes = 0;
5808
5809   coding_type = CODING_ATTR_TYPE (attrs);
5810   if (EQ (coding_type, Qundecided))
5811     {
5812       coding->detector = NULL;
5813       coding->decoder = decode_coding_raw_text;
5814       coding->encoder = encode_coding_raw_text;
5815       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5816     }
5817   else if (EQ (coding_type, Qiso_2022))
5818     {
5819       int i;
5820       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5821
5822       /* Invoke graphic register 0 to plane 0.  */
5823       CODING_ISO_INVOCATION (coding, 0) = 0;
5824       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5825       CODING_ISO_INVOCATION (coding, 1)
5826         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5827       /* Setup the initial status of designation.  */
5828       for (i = 0; i < 4; i++)
5829         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5830       /* Not single shifting initially.  */
5831       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5832       /* Beginning of buffer should also be regarded as bol. */
5833       CODING_ISO_BOL (coding) = 1;
5834       coding->detector = detect_coding_iso_2022;
5835       coding->decoder = decode_coding_iso_2022;
5836       coding->encoder = encode_coding_iso_2022;
5837       if (flags & CODING_ISO_FLAG_SAFE)
5838         coding->mode |= CODING_MODE_SAFE_ENCODING;
5839       coding->common_flags
5840         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5841             | CODING_REQUIRE_FLUSHING_MASK);
5842       if (flags & CODING_ISO_FLAG_COMPOSITION)
5843         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5844       if (flags & CODING_ISO_FLAG_DESIGNATION)
5845         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5846       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5847         {
5848           setup_iso_safe_charsets (attrs);
5849           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5850           coding->max_charset_id = SCHARS (val) - 1;
5851           coding->safe_charsets = SDATA (val);
5852         }
5853       CODING_ISO_FLAGS (coding) = flags;
5854       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5855       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5856       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5857       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5858     }
5859   else if (EQ (coding_type, Qcharset))
5860     {
5861       coding->detector = detect_coding_charset;
5862       coding->decoder = decode_coding_charset;
5863       coding->encoder = encode_coding_charset;
5864       coding->common_flags
5865         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5866     }
5867   else if (EQ (coding_type, Qutf_8))
5868     {
5869       val = AREF (attrs, coding_attr_utf_bom);
5870       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5871                                    : EQ (val, Qt) ? utf_with_bom
5872                                    : utf_without_bom);
5873       coding->detector = detect_coding_utf_8;
5874       coding->decoder = decode_coding_utf_8;
5875       coding->encoder = encode_coding_utf_8;
5876       coding->common_flags
5877         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5878       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5879         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5880     }
5881   else if (EQ (coding_type, Qutf_16))
5882     {
5883       val = AREF (attrs, coding_attr_utf_bom);
5884       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5885                                     : EQ (val, Qt) ? utf_with_bom
5886                                     : utf_without_bom);
5887       val = AREF (attrs, coding_attr_utf_16_endian);
5888       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5889                                        : utf_16_little_endian);
5890       CODING_UTF_16_SURROGATE (coding) = 0;
5891       coding->detector = detect_coding_utf_16;
5892       coding->decoder = decode_coding_utf_16;
5893       coding->encoder = encode_coding_utf_16;
5894       coding->common_flags
5895         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5896       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5897         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5898     }
5899   else if (EQ (coding_type, Qccl))
5900     {
5901       coding->detector = detect_coding_ccl;
5902       coding->decoder = decode_coding_ccl;
5903       coding->encoder = encode_coding_ccl;
5904       coding->common_flags
5905         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5906             | CODING_REQUIRE_FLUSHING_MASK);
5907     }
5908   else if (EQ (coding_type, Qemacs_mule))
5909     {
5910       coding->detector = detect_coding_emacs_mule;
5911       coding->decoder = decode_coding_emacs_mule;
5912       coding->encoder = encode_coding_emacs_mule;
5913       coding->common_flags
5914         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5915       coding->spec.emacs_mule.full_support = 1;
5916       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5917           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5918         {
5919           Lisp_Object tail, safe_charsets;
5920           int max_charset_id = 0;
5921
5922           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5923                tail = XCDR (tail))
5924             if (max_charset_id < XFASTINT (XCAR (tail)))
5925               max_charset_id = XFASTINT (XCAR (tail));
5926           safe_charsets = make_uninit_string (max_charset_id + 1);
5927           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5928           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5929                tail = XCDR (tail))
5930             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5931           coding->max_charset_id = max_charset_id;
5932           coding->safe_charsets = SDATA (safe_charsets);
5933           coding->spec.emacs_mule.full_support = 1;
5934         }
5935       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5936       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5937     }
5938   else if (EQ (coding_type, Qshift_jis))
5939     {
5940       coding->detector = detect_coding_sjis;
5941       coding->decoder = decode_coding_sjis;
5942       coding->encoder = encode_coding_sjis;
5943       coding->common_flags
5944         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5945     }
5946   else if (EQ (coding_type, Qbig5))
5947     {
5948       coding->detector = detect_coding_big5;
5949       coding->decoder = decode_coding_big5;
5950       coding->encoder = encode_coding_big5;
5951       coding->common_flags
5952         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5953     }
5954   else                          /* EQ (coding_type, Qraw_text) */
5955     {
5956       coding->detector = NULL;
5957       coding->decoder = decode_coding_raw_text;
5958       coding->encoder = encode_coding_raw_text;
5959       if (! EQ (eol_type, Qunix))
5960         {
5961           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5962           if (! VECTORP (eol_type))
5963             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5964         }
5965
5966     }
5967
5968   return;
5969 }
5970
5971 /* Return a list of charsets supported by CODING.  */
5972
5973 Lisp_Object
5974 coding_charset_list (coding)
5975      struct coding_system *coding;
5976 {
5977   Lisp_Object attrs, charset_list;
5978
5979   CODING_GET_INFO (coding, attrs, charset_list);
5980   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5981     {
5982       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5983
5984       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5985         charset_list = Viso_2022_charset_list;
5986     }
5987   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5988     {
5989       charset_list = Vemacs_mule_charset_list;
5990     }
5991   return charset_list;
5992 }
5993
5994
5995 /* Return a list of charsets supported by CODING-SYSTEM.  */
5996
5997 Lisp_Object
5998 coding_system_charset_list (coding_system)
5999      Lisp_Object coding_system;
6000 {
6001   int id;
6002   Lisp_Object attrs, charset_list;
6003
6004   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
6005   attrs = CODING_ID_ATTRS (id);
6006
6007   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
6008     {
6009       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
6010
6011       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
6012         charset_list = Viso_2022_charset_list;
6013       else
6014         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6015     }
6016   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6017     {
6018       charset_list = Vemacs_mule_charset_list;
6019     }
6020   else
6021     {
6022       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6023     }
6024   return charset_list;
6025 }
6026
6027
6028 /* Return raw-text or one of its subsidiaries that has the same
6029    eol_type as CODING-SYSTEM.  */
6030
6031 Lisp_Object
6032 raw_text_coding_system (coding_system)
6033      Lisp_Object coding_system;
6034 {
6035   Lisp_Object spec, attrs;
6036   Lisp_Object eol_type, raw_text_eol_type;
6037
6038   if (NILP (coding_system))
6039     return Qraw_text;
6040   spec = CODING_SYSTEM_SPEC (coding_system);
6041   attrs = AREF (spec, 0);
6042
6043   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6044     return coding_system;
6045
6046   eol_type = AREF (spec, 2);
6047   if (VECTORP (eol_type))
6048     return Qraw_text;
6049   spec = CODING_SYSTEM_SPEC (Qraw_text);
6050   raw_text_eol_type = AREF (spec, 2);
6051   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6052           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6053           : AREF (raw_text_eol_type, 2));
6054 }
6055
6056
6057 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
6058    does, return one of the subsidiary that has the same eol-spec as
6059    PARENT.  Otherwise, return CODING_SYSTEM.  If PARENT is nil,
6060    inherit end-of-line format from the system's setting
6061    (system_eol_type).  */
6062
6063 Lisp_Object
6064 coding_inherit_eol_type (coding_system, parent)
6065      Lisp_Object coding_system, parent;
6066 {
6067   Lisp_Object spec, eol_type;
6068
6069   if (NILP (coding_system))
6070     coding_system = Qraw_text;
6071   spec = CODING_SYSTEM_SPEC (coding_system);
6072   eol_type = AREF (spec, 2);
6073   if (VECTORP (eol_type))
6074     {
6075       Lisp_Object parent_eol_type;
6076
6077       if (! NILP (parent))
6078         {
6079           Lisp_Object parent_spec;
6080
6081           parent_spec = CODING_SYSTEM_SPEC (parent);
6082           parent_eol_type = AREF (parent_spec, 2);
6083         }
6084       else
6085         parent_eol_type = system_eol_type;
6086       if (EQ (parent_eol_type, Qunix))
6087         coding_system = AREF (eol_type, 0);
6088       else if (EQ (parent_eol_type, Qdos))
6089         coding_system = AREF (eol_type, 1);
6090       else if (EQ (parent_eol_type, Qmac))
6091         coding_system = AREF (eol_type, 2);
6092     }
6093   return coding_system;
6094 }
6095
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are grouped into the following categories:
6101
6102    o coding-category-emacs-mule
6103
6104         The category for a coding system which has the same code range
6105         as Emacs' internal format.  Assigned the coding-system (Lisp
6106         symbol) `emacs-mule' by default.
6107
6108    o coding-category-sjis
6109
6110         The category for a coding system which has the same code range
6111         as SJIS.  Assigned the coding-system (Lisp
6112         symbol) `japanese-shift-jis' by default.
6113
6114    o coding-category-iso-7
6115
6116         The category for a coding system which has the same code range
6117         as ISO2022 of 7-bit environment.  This doesn't use any locking
6118         shift and single shift functions.  This can encode/decode all
6119         charsets.  Assigned the coding-system (Lisp symbol)
6120         `iso-2022-7bit' by default.
6121
6122    o coding-category-iso-7-tight
6123
6124         Same as coding-category-iso-7 except that this can
6125         encode/decode only the specified charsets.
6126
6127    o coding-category-iso-8-1
6128
6129         The category for a coding system which has the same code range
6130         as ISO2022 of 8-bit environment and graphic plane 1 used only
6131         for DIMENSION1 charset.  This doesn't use any locking shift
6132         and single shift functions.  Assigned the coding-system (Lisp
6133         symbol) `iso-latin-1' by default.
6134
6135    o coding-category-iso-8-2
6136
6137         The category for a coding system which has the same code range
6138         as ISO2022 of 8-bit environment and graphic plane 1 used only
6139         for DIMENSION2 charset.  This doesn't use any locking shift
6140         and single shift functions.  Assigned the coding-system (Lisp
6141         symbol) `japanese-iso-8bit' by default.
6142
6143    o coding-category-iso-7-else
6144
6145         The category for a coding system which has the same code range
        as ISO2022 of 7-bit environment but uses locking shift or
6147         single shift functions.  Assigned the coding-system (Lisp
6148         symbol) `iso-2022-7bit-lock' by default.
6149
6150    o coding-category-iso-8-else
6151
6152         The category for a coding system which has the same code range
        as ISO2022 of 8-bit environment but uses locking shift or
6154         single shift functions.  Assigned the coding-system (Lisp
6155         symbol) `iso-2022-8bit-ss2' by default.
6156
6157    o coding-category-big5
6158
6159         The category for a coding system which has the same code range
6160         as BIG5.  Assigned the coding-system (Lisp symbol)
6161         `cn-big5' by default.
6162
6163    o coding-category-utf-8
6164
6165         The category for a coding system which has the same code range
6166         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6167         symbol) `utf-8' by default.
6168
6169    o coding-category-utf-16-be
6170
        The category for a coding system in which a text has a
6172         Unicode signature (cf. Unicode Standard) in the order of BIG
6173         endian at the head.  Assigned the coding-system (Lisp symbol)
6174         `utf-16-be' by default.
6175
6176    o coding-category-utf-16-le
6177
        The category for a coding system in which a text has a
6179         Unicode signature (cf. Unicode Standard) in the order of
6180         LITTLE endian at the head.  Assigned the coding-system (Lisp
6181         symbol) `utf-16-le' by default.
6182
6183    o coding-category-ccl
6184
6185         The category for a coding system of which encoder/decoder is
6186         written in CCL programs.  The default value is nil, i.e., no
6187         coding system is assigned.
6188
6189    o coding-category-binary
6190
6191         The category for a coding system not categorized in any of the
6192         above.  Assigned the coding-system (Lisp symbol)
6193         `no-conversion' by default.
6194
6195    Each of them is a Lisp symbol and the value is an actual
6196    `coding-system's (this is also a Lisp symbol) assigned by a user.
6197    What Emacs does actually is to detect a category of coding system.
6198    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6199    decide only one possible category, it selects a category of the
6200    highest priority.  Priorities of categories are also specified by a
6201    user in a Lisp variable `coding-category-list'.
6202
6203 */
6204
6205 #define EOL_SEEN_NONE   0
6206 #define EOL_SEEN_LF     1
6207 #define EOL_SEEN_CR     2
6208 #define EOL_SEEN_CRLF   4
6209
6210 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6211    SOURCE is encoded.  If CATEGORY is one of
6212    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6213    two-byte, else they are encoded by one-byte.
6214
6215    Return one of EOL_SEEN_XXX.  */
6216
6217 #define MAX_EOL_CHECK_COUNT 3
6218
6219 static int
6220 detect_eol (source, src_bytes, category)
6221      const unsigned char *source;
6222      EMACS_INT src_bytes;
6223      enum coding_category category;
6224 {
6225   const unsigned char *src = source, *src_end = src + src_bytes;
6226   unsigned char c;
6227   int total  = 0;
6228   int eol_seen = EOL_SEEN_NONE;
6229
6230   if ((1 << category) & CATEGORY_MASK_UTF_16)
6231     {
6232       int msb, lsb;
6233
6234       msb = category == (coding_category_utf_16_le
6235                          | coding_category_utf_16_le_nosig);
6236       lsb = 1 - msb;
6237
6238       while (src + 1 < src_end)
6239         {
6240           c = src[lsb];
6241           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6242             {
6243               int this_eol;
6244
6245               if (c == '\n')
6246                 this_eol = EOL_SEEN_LF;
6247               else if (src + 3 >= src_end
6248                        || src[msb + 2] != 0
6249                        || src[lsb + 2] != '\n')
6250                 this_eol = EOL_SEEN_CR;
6251               else
6252                 {
6253                   this_eol = EOL_SEEN_CRLF;
6254                   src += 2;
6255                 }
6256
6257               if (eol_seen == EOL_SEEN_NONE)
6258                 /* This is the first end-of-line.  */
6259                 eol_seen = this_eol;
6260               else if (eol_seen != this_eol)
6261                 {
6262                   /* The found type is different from what found before.
6263                      Allow for stray ^M characters in DOS EOL files.  */
6264                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6265                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6266                     eol_seen = EOL_SEEN_CRLF;
6267                   else
6268                     {
6269                       eol_seen = EOL_SEEN_LF;
6270                       break;
6271                     }
6272                 }
6273               if (++total == MAX_EOL_CHECK_COUNT)
6274                 break;
6275             }
6276           src += 2;
6277         }
6278     }
6279   else
6280     {
6281       while (src < src_end)
6282         {
6283           c = *src++;
6284           if (c == '\n' || c == '\r')
6285             {
6286               int this_eol;
6287
6288               if (c == '\n')
6289                 this_eol = EOL_SEEN_LF;
6290               else if (src >= src_end || *src != '\n')
6291                 this_eol = EOL_SEEN_CR;
6292               else
6293                 this_eol = EOL_SEEN_CRLF, src++;
6294
6295               if (eol_seen == EOL_SEEN_NONE)
6296                 /* This is the first end-of-line.  */
6297                 eol_seen = this_eol;
6298               else if (eol_seen != this_eol)
6299                 {
6300                   /* The found type is different from what found before.
6301                      Allow for stray ^M characters in DOS EOL files.  */
6302                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6303                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6304                     eol_seen = EOL_SEEN_CRLF;
6305                   else
6306                     {
6307                       eol_seen = EOL_SEEN_LF;
6308                       break;
6309                     }
6310                 }
6311               if (++total == MAX_EOL_CHECK_COUNT)
6312                 break;
6313             }
6314         }
6315     }
6316   return eol_seen;
6317 }
6318
6319
6320 static Lisp_Object
6321 adjust_coding_eol_type (coding, eol_seen)
6322      struct coding_system *coding;
6323      int eol_seen;
6324 {
6325   Lisp_Object eol_type;
6326
6327   eol_type = CODING_ID_EOL_TYPE (coding->id);
6328   if (eol_seen & EOL_SEEN_LF)
6329     {
6330       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6331       eol_type = Qunix;
6332     }
6333   else if (eol_seen & EOL_SEEN_CRLF)
6334     {
6335       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6336       eol_type = Qdos;
6337     }
6338   else if (eol_seen & EOL_SEEN_CR)
6339     {
6340       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6341       eol_type = Qmac;
6342     }
6343   return eol_type;
6344 }
6345
6346 /* Detect how a text specified in CODING is encoded.  If a coding
6347    system is detected, update fields of CODING by the detected coding
6348    system.

        Three cases are handled, keyed on the current coding system of
        CODING: type `undecided' (detection over the prioritized coding
        categories), category utf-8-auto, and category utf-16-auto (the
        latter two only choose between the pair of coding systems stored
        in the coding_attr_utf_bom attribute).  For any other coding
        system no detection is done.

        The consumed/produced counters and CODING->head_ascii are reset
        on entry; CODING->mode is saved on entry and restored on return.  */
6349
6350 void
6351 detect_coding (coding)
6352      struct coding_system *coding;
6353 {
6354   const unsigned char *src, *src_end;
6355   int saved_mode = coding->mode;
6356
6357   coding->consumed = coding->consumed_char = 0;
6358   coding->produced = coding->produced_char = 0;
6359   coding_set_source (coding);
6360
6361   src_end = coding->source + coding->src_bytes;
6362   coding->head_ascii = 0;
6363
6364   /* If we have not yet decided the text encoding type, detect it
6365      now.  */
6366   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6367     {
6368       int c, i;
6369       struct coding_detection_info detect_info;
6370       int null_byte_found = 0, eight_bit_found = 0;
6371
6372       detect_info.checked = detect_info.found = detect_info.rejected = 0;
           /* Pre-scan the source.  CODING->head_ascii counts the bytes that
              precede the first 8-bit byte.  The scan stops early once both
              a null byte and an 8-bit byte have been seen, or when the
              ISO-2022 detector reports success.  */
6373       for (src = coding->source; src < src_end; src++)
6374         {
6375           c = *src;
6376           if (c & 0x80)
6377             {
6378               eight_bit_found = 1;
6379               if (null_byte_found)
6380                 break;
6381             }
6382           else if (c < 0x20)
6383             {
                   /* The ISO-2022 detector is tried only while
                      detect_info.checked is still zero (presumably the
                      detector sets bits there, so it runs at most once --
                      confirm against detect_coding_iso_2022).  */
6384               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6385                   && ! inhibit_iso_escape_detection
6386                   && ! detect_info.checked)
6387                 {
6388                   if (detect_coding_iso_2022 (coding, &detect_info))
6389                     {
6390                       /* We have scanned the whole data.  */
6391                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6392                         {
6393                           /* We didn't find an 8-bit code.  We may
6394                              have found a null-byte, but it's very
6395                              rare that a binary file conform to
6396                              ISO-2022.  */
6397                           src = src_end;
6398                           coding->head_ascii = src - coding->source;
6399                         }
6400                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6401                       break;
6402                     }
6403                 }
6404               else if (! c && !inhibit_null_byte_detection)
6405                 {
6406                   null_byte_found = 1;
6407                   if (eight_bit_found)
6408                     break;
6409                 }
6410               if (! eight_bit_found)
6411                 coding->head_ascii++;
6412             }
6413           else if (! eight_bit_found)
6414             coding->head_ascii++;
6415         }
6416
           /* Nothing below runs for a source that was scanned to the end
              without a null byte, an 8-bit byte, or a detector hit; CODING
              is then left as it was.  */
6417       if (null_byte_found || eight_bit_found
6418           || coding->head_ascii < coding->src_bytes
6419           || detect_info.found)
6420         {
6421           enum coding_category category;
6422           struct coding_system *this;
6423
6424           if (coding->head_ascii == coding->src_bytes)
6425             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6426             for (i = 0; i < coding_category_raw_text; i++)
6427               {
6428                 category = coding_priorities[i];
6429                 this = coding_categories + category;
6430                 if (detect_info.found & (1 << category))
6431                   break;
6432               }
6433           else
6434             {
6435               if (null_byte_found)
6436                 {
                       /* Mark every non-UTF-16 category as already checked
                          and rejected, so that only the UTF-16 detectors can
                          run in the loop below.  */
6437                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6438                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6439                 }
                   /* Walk the categories in priority order, running the
                      detector of each category that has not been checked
                      yet, and stop at the first one that is found.  */
6440               for (i = 0; i < coding_category_raw_text; i++)
6441                 {
6442                   category = coding_priorities[i];
6443                   this = coding_categories + category;
6444                   if (this->id < 0)
6445                     {
6446                       /* No coding system of this category is defined.  */
6447                       detect_info.rejected |= (1 << category);
6448                     }
6449                   else if (category >= coding_category_raw_text)
6450                     continue;
6451                   else if (detect_info.checked & (1 << category))
6452                     {
6453                       if (detect_info.found & (1 << category))
6454                         break;
6455                     }
6456                   else if ((*(this->detector)) (coding, &detect_info)
6457                            && detect_info.found & (1 << category))
6458                     {
                           /* NOTE(review): CATEGORY is refined to LE/BE
                              here, but THIS is not updated and CATEGORY is
                              not read again after the loop, so the
                              refinement does not change which coding system
                              is set up below -- confirm intent.  */
6459                       if (category == coding_category_utf_16_auto)
6460                         {
6461                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6462                             category = coding_category_utf_16_le;
6463                           else
6464                             category = coding_category_utf_16_be;
6465                         }
6466                       break;
6467                     }
6468                 }
6469             }
6470
               /* Pick the result: the category a loop above stopped on;
                  otherwise no-conversion if a null byte was seen;
                  otherwise raw-text if every category was rejected;
                  otherwise the highest-priority category not rejected
                  (only when at least one category was rejected).  */
6471           if (i < coding_category_raw_text)
6472             setup_coding_system (CODING_ID_NAME (this->id), coding);
6473           else if (null_byte_found)
6474             setup_coding_system (Qno_conversion, coding);
6475           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6476                    == CATEGORY_MASK_ANY)
6477             setup_coding_system (Qraw_text, coding);
6478           else if (detect_info.rejected)
6479             for (i = 0; i < coding_category_raw_text; i++)
6480               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6481                 {
6482                   this = coding_categories + coding_priorities[i];
6483                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6484                   break;
6485                 }
6486         }
6487     }
6488   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6489            == coding_category_utf_8_auto)
6490     {
6491       Lisp_Object coding_systems;
6492       struct coding_detection_info detect_info;
6493
           /* The utf_bom attribute is used here as a cons: its car is
              selected when the UTF-8 signature category is found, its cdr
              otherwise.  Nothing is changed if it is not a cons or the
              detector fails.  */
6494       coding_systems
6495         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6496       detect_info.found = detect_info.rejected = 0;
6497       coding->head_ascii = 0;
6498       if (CONSP (coding_systems)
6499           && detect_coding_utf_8 (coding, &detect_info))
6500         {
6501           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6502             setup_coding_system (XCAR (coding_systems), coding);
6503           else
6504             setup_coding_system (XCDR (coding_systems), coding);
6505         }
6506     }
6507   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6508            == coding_category_utf_16_auto)
6509     {
6510       Lisp_Object coding_systems;
6511       struct coding_detection_info detect_info;
6512
           /* Same scheme as for utf-8-auto: the car is selected when the
              little-endian category is found, the cdr when the big-endian
              category is found; otherwise CODING is left unchanged.  */
6513       coding_systems
6514         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6515       detect_info.found = detect_info.rejected = 0;
6516       coding->head_ascii = 0;
6517       if (CONSP (coding_systems)
6518           && detect_coding_utf_16 (coding, &detect_info))
6519         {
6520           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6521             setup_coding_system (XCAR (coding_systems), coding);
6522           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6523             setup_coding_system (XCDR (coding_systems), coding);
6524         }
6525     }
6526   coding->mode = saved_mode;
6527 }
6528
6529
6530 static void
6531 decode_eol (coding)
6532      struct coding_system *coding;
6533 {
6534   Lisp_Object eol_type;
6535   unsigned char *p, *pbeg, *pend;
6536
6537   eol_type = CODING_ID_EOL_TYPE (coding->id);
6538   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6539     return;
6540
6541   if (NILP (coding->dst_object))
6542     pbeg = coding->destination;
6543   else
6544     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6545   pend = pbeg + coding->produced;
6546
6547   if (VECTORP (eol_type))
6548     {
6549       int eol_seen = EOL_SEEN_NONE;
6550
6551       for (p = pbeg; p < pend; p++)
6552         {
6553           if (*p == '\n')
6554             eol_seen |= EOL_SEEN_LF;
6555           else if (*p == '\r')
6556             {
6557               if (p + 1 < pend && *(p + 1) == '\n')
6558                 {
6559                   eol_seen |= EOL_SEEN_CRLF;
6560                   p++;
6561                 }
6562               else
6563                 eol_seen |= EOL_SEEN_CR;
6564             }
6565         }
6566       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6567       if ((eol_seen & EOL_SEEN_CRLF) != 0
6568           && (eol_seen & EOL_SEEN_CR) != 0
6569           && (eol_seen & EOL_SEEN_LF) == 0)
6570         eol_seen = EOL_SEEN_CRLF;
6571       else if (eol_seen != EOL_SEEN_NONE
6572           && eol_seen != EOL_SEEN_LF
6573           && eol_seen != EOL_SEEN_CRLF
6574           && eol_seen != EOL_SEEN_CR)
6575         eol_seen = EOL_SEEN_LF;
6576       if (eol_seen != EOL_SEEN_NONE)
6577         eol_type = adjust_coding_eol_type (coding, eol_seen);
6578     }
6579
6580   if (EQ (eol_type, Qmac))
6581     {
6582       for (p = pbeg; p < pend; p++)
6583         if (*p == '\r')
6584           *p = '\n';
6585     }
6586   else if (EQ (eol_type, Qdos))
6587     {
6588       int n = 0;
6589
6590       if (NILP (coding->dst_object))
6591         {
6592           /* Start deleting '\r' from the tail to minimize the memory
6593              movement.  */
6594           for (p = pend - 2; p >= pbeg; p--)
6595             if (*p == '\r')
6596               {
6597                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6598                 n++;
6599               }
6600         }
6601       else
6602         {
6603           int pos_byte = coding->dst_pos_byte;
6604           int pos = coding->dst_pos;
6605           int pos_end = pos + coding->produced_char - 1;
6606
6607           while (pos < pos_end)
6608             {
6609               p = BYTE_POS_ADDR (pos_byte);
6610               if (*p == '\r' && p[1] == '\n')
6611                 {
6612                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6613                   n++;
6614                   pos_end--;
6615                 }
6616               pos++;
6617               if (coding->dst_multibyte)
6618                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6619               else
6620                 pos_byte++;
6621             }
6622         }
6623       coding->produced -= n;
6624       coding->produced_char -= n;
6625     }
6626 }
6627
6628
6629 /* Return a translation table (or list of them) from coding system
6630    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6631    decoding (ENCODEP is zero). */
6632
6633 static Lisp_Object
6634 get_translation_table (attrs, encodep, max_lookup)
6635      Lisp_Object attrs;
6636      int encodep, *max_lookup;
6637 {
6638   Lisp_Object standard, translation_table;
6639   Lisp_Object val;
6640
6641   if (NILP (Venable_character_translation))
6642     {
6643       if (max_lookup)
6644         *max_lookup = 0;
6645       return Qnil;
6646     }
6647   if (encodep)
6648     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6649       standard = Vstandard_translation_table_for_encode;
6650   else
6651     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6652       standard = Vstandard_translation_table_for_decode;
6653   if (NILP (translation_table))
6654     translation_table = standard;
6655   else
6656     {
6657       if (SYMBOLP (translation_table))
6658         translation_table = Fget (translation_table, Qtranslation_table);
6659       else if (CONSP (translation_table))
6660         {
6661           translation_table = Fcopy_sequence (translation_table);
6662           for (val = translation_table; CONSP (val); val = XCDR (val))
6663             if (SYMBOLP (XCAR (val)))
6664               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6665         }
6666       if (CHAR_TABLE_P (standard))
6667         {
6668           if (CONSP (translation_table))
6669             translation_table = nconc2 (translation_table,
6670                                         Fcons (standard, Qnil));
6671           else
6672             translation_table = Fcons (translation_table,
6673                                        Fcons (standard, Qnil));
6674         }
6675     }
6676
6677   if (max_lookup)
6678     {
6679       *max_lookup = 1;
6680       if (CHAR_TABLE_P (translation_table)
6681           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6682         {
6683           val = XCHAR_TABLE (translation_table)->extras[1];
6684           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6685             *max_lookup = XFASTINT (val);
6686         }
6687       else if (CONSP (translation_table))
6688         {
6689           Lisp_Object tail, val;
6690
6691           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6692             if (CHAR_TABLE_P (XCAR (tail))
6693                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6694               {
6695                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6696                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6697                   *max_lookup = XFASTINT (val);
6698               }
6699         }
6700     }
6701   return translation_table;
6702 }
6703
/* Look up character C in TABLE, which is a char-table or a list whose
   char-table elements are tried in order (other elements are skipped).

   When a table maps C to a character, C is overwritten with that
   character, TRANS is reset to Qnil, and, in the list case, the lookup
   goes on with the next table using the new C.  When a table yields a
   non-nil value that is not a character, that value is left in TRANS
   and the lookup stops.  If nothing of that kind is found, TRANS ends
   up Qnil.

   C and TRANS must be lvalues; TABLE and C are evaluated more than
   once.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    trans = Qnil;                                                       \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        trans = CHAR_TABLE_REF (table, c);                              \
        if (CHARACTERP (trans))                                         \
          {                                                             \
            c = XFASTINT (trans);                                       \
            trans = Qnil;                                               \
          }                                                             \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        Lisp_Object table_rest;                                         \
                                                                        \
        for (table_rest = table; CONSP (table_rest);                    \
             table_rest = XCDR (table_rest))                            \
          {                                                             \
            if (! CHAR_TABLE_P (XCAR (table_rest)))                     \
              continue;                                                 \
            trans = CHAR_TABLE_REF (XCAR (table_rest), c);              \
            if (CHARACTERP (trans))                                     \
              {                                                         \
                c = XFASTINT (trans);                                   \
                trans = Qnil;                                           \
              }                                                         \
            else if (! NILP (trans))                                    \
              break;                                                    \
          }                                                             \
      }                                                                 \
  } while (0)
6728
6729
6730 /* Return a translation of character(s) at BUF according to TRANS.
6731    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6732    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6733    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6734    translation is found, and Qnil if not found..
6735    If BUF is too short to lookup characters in FROM, return Qt.  */
6736
6737 static Lisp_Object
6738 get_translation (trans, buf, buf_end)
6739      Lisp_Object trans;
6740      int *buf, *buf_end;
6741 {
6742
6743   if (INTEGERP (trans))
6744     return trans;
6745   for (; CONSP (trans); trans = XCDR (trans))
6746     {
6747       Lisp_Object val = XCAR (trans);
6748       Lisp_Object from = XCAR (val);
6749       int len = ASIZE (from);
6750       int i;
6751
6752       for (i = 0; i < len; i++)
6753         {
6754           if (buf + i == buf_end)
6755             return Qt;
6756           if (XINT (AREF (from, i)) != buf[i])
6757             break;
6758         }
6759       if (i == len)
6760         return val;
6761     }
6762   return Qnil;
6763 }
6764
6765
/* Write converted characters to the destination of CODING, starting at
   CODING->destination + CODING->produced, and return the number of
   elements left unprocessed at the end of CODING->charbuf (the
   carryover; it stays 0 when CODING->chars_at_source is nonzero).

   When CODING->chars_at_source is zero, the source is CODING->charbuf:
   a non-negative element is a character, which is translated through
   TRANSLATION_TABLE; a negative element -N starts an annotation datum
   of N elements, which is skipped here.  Otherwise the source is the
   CODING->consumed bytes at CODING->source and TRANSLATION_TABLE is
   not consulted.

   LAST_BLOCK zero means more source may follow: if get_translation
   returns Qt (not enough characters to decide a translation), the
   loop stops and the remaining elements are reported as carryover.
   With LAST_BLOCK nonzero such a character is emitted without a
   multi-character translation.

   Before returning, CODING->produced and CODING->produced_char are
   advanced, and insert_from_gap is called if CODING->dst_object is a
   buffer and at least one character was produced.  */
6766 static int
6767 produce_chars (coding, translation_table, last_block)
6768      struct coding_system *coding;
6769      Lisp_Object translation_table;
6770      int last_block;
6771 {
6772   unsigned char *dst = coding->destination + coding->produced;
6773   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6774   EMACS_INT produced;
6775   EMACS_INT produced_chars = 0;
6776   int carryover = 0;
6777
6778   if (! coding->chars_at_source)
6779     {
6780       /* Source characters are in coding->charbuf.  */
6781       int *buf = coding->charbuf;
6782       int *buf_end = buf + coding->charbuf_used;
6783
           /* Source and destination are the same object: the destination
              may only extend up to the consumed part of the source.  */
6784       if (EQ (coding->src_object, coding->dst_object))
6785         {
6786           coding_set_source (coding);
6787           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6788         }
6789
6790       while (buf < buf_end)
6791         {
6792           int c = *buf, i;
6793
6794           if (c >= 0)
6795             {
                   /* FROM_NCHARS source characters are replaced by
                      TO_NCHARS characters; both are 1 unless a
                      sequence translation applies.  */
6796               int from_nchars = 1, to_nchars = 1;
6797               Lisp_Object trans = Qnil;
6798
6799               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6800               if (! NILP (trans))
6801                 {
6802                   trans = get_translation (trans, buf, buf_end);
6803                   if (INTEGERP (trans))
6804                     c = XINT (trans);
6805                   else if (CONSP (trans))
6806                     {
6807                       from_nchars = ASIZE (XCAR (trans));
6808                       trans = XCDR (trans);
6809                       if (INTEGERP (trans))
6810                         c = XINT (trans);
6811                       else
6812                         {
6813                           to_nchars = ASIZE (trans);
6814                           c = XINT (AREF (trans, 0));
6815                         }
6816                     }
                       /* Not enough characters in BUF to decide, and more
                          may come: leave the rest as carryover.  */
6817                   else if (EQ (trans, Qt) && ! last_block)
6818                     break;
6819                 }
6820
                   /* Make sure there is room for TO_NCHARS characters of
                      MAX_MULTIBYTE_LENGTH bytes each, then recompute
                      DST_END because the destination may have moved.  */
6821               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6822                 {
6823                   dst = alloc_destination (coding,
6824                                            buf_end - buf
6825                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6826                                            dst);
6827                   if (EQ (coding->src_object, coding->dst_object))
6828                     {
6829                       coding_set_source (coding);
6830                       dst_end = (((unsigned char *) coding->source)
6831                                  + coding->consumed);
6832                     }
6833                   else
6834                     dst_end = coding->destination + coding->dst_bytes;
6835                 }
6836
                   /* C already holds the first character to emit; the
                      following ones come from the vector TRANS.  */
6837               for (i = 0; i < to_nchars; i++)
6838                 {
6839                   if (i > 0)
6840                     c = XINT (AREF (trans, i));
6841                   if (coding->dst_multibyte
6842                       || ! CHAR_BYTE8_P (c))
6843                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6844                   else
6845                     *dst++ = CHAR_TO_BYTE8 (c);
6846                 }
6847               produced_chars += to_nchars;
6848               buf += from_nchars;
6849             }
6850           else
6851             /* This is an annotation datum.  (-C) is the length.  */
6852             buf += -c;
6853         }
6854       carryover = buf_end - buf;
6855     }
6856   else
6857     {
6858       /* Source characters are at coding->source.  */
6859       const unsigned char *src = coding->source;
6860       const unsigned char *src_end = src + coding->consumed;
6861
6862       if (EQ (coding->dst_object, coding->src_object))
6863         dst_end = (unsigned char *) src;
6864       if (coding->src_multibyte != coding->dst_multibyte)
6865         {
6866           if (coding->src_multibyte)
6867             {
                   /* Multibyte source, unibyte destination: one byte is
                      stored for each value obtained from ONE_MORE_BYTE.
                      NOTE(review): ONE_MORE_BYTE is defined elsewhere; it
                      presumably uses the locals declared here (multibytep,
                      consumed_chars, src_base) and leaves the loop through
                      the no_more_source label -- confirm.  */
6868               int multibytep = 1;
6869               EMACS_INT consumed_chars = 0;
6870
6871               while (1)
6872                 {
6873                   const unsigned char *src_base = src;
6874                   int c;
6875
6876                   ONE_MORE_BYTE (c);
6877                   if (dst == dst_end)
6878                     {
                           /* When converting in place, the bytes just read
                              free up room, so retry with the new limit
                              before allocating.  */
6879                       if (EQ (coding->src_object, coding->dst_object))
6880                         dst_end = (unsigned char *) src;
6881                       if (dst == dst_end)
6882                         {
6883                           EMACS_INT offset = src - coding->source;
6884
6885                           dst = alloc_destination (coding, src_end - src + 1,
6886                                                    dst);
6887                           dst_end = coding->destination + coding->dst_bytes;
6888                           coding_set_source (coding);
6889                           src = coding->source + offset;
6890                           src_end = coding->source + coding->src_bytes;
6891                           if (EQ (coding->src_object, coding->dst_object))
6892                             dst_end = (unsigned char *) src;
6893                         }
6894                     }
6895                   *dst++ = c;
6896                   produced_chars++;
6897                 }
6898             no_more_source:
6899               ;
6900             }
6901           else
                 /* Unibyte source, multibyte destination.  NOTE(review):
                    EMIT_ONE_BYTE is defined elsewhere; the `dst_end - 1'
                    tests suggest it may store up to two bytes per source
                    byte -- confirm.  */
6902             while (src < src_end)
6903               {
6904                 int multibytep = 1;
6905                 int c = *src++;
6906
6907                 if (dst >= dst_end - 1)
6908                   {
6909                     if (EQ (coding->src_object, coding->dst_object))
6910                       dst_end = (unsigned char *) src;
6911                     if (dst >= dst_end - 1)
6912                       {
6913                         EMACS_INT offset = src - coding->source;
6914                         EMACS_INT more_bytes;
6915
6916                         if (EQ (coding->src_object, coding->dst_object))
6917                           more_bytes = ((src_end - src) / 2) + 2;
6918                         else
6919                           more_bytes = src_end - src + 2;
6920                         dst = alloc_destination (coding, more_bytes, dst);
6921                         dst_end = coding->destination + coding->dst_bytes;
6922                         coding_set_source (coding);
6923                         src = coding->source + offset;
6924                         src_end = coding->source + coding->src_bytes;
6925                         if (EQ (coding->src_object, coding->dst_object))
6926                           dst_end = (unsigned char *) src;
6927                       }
6928                   }
6929                 EMIT_ONE_BYTE (c);
6930               }
6931         }
6932       else
6933         {
               /* Source and destination have the same multibyteness: the
                  bytes are copied unchanged and the character count is
                  taken from coding->consumed_char.  */
6934           if (!EQ (coding->src_object, coding->dst_object))
6935             {
6936               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6937
6938               if (require > 0)
6939                 {
6940                   EMACS_INT offset = src - coding->source;
6941
6942                   dst = alloc_destination (coding, require, dst);
6943                   coding_set_source (coding);
6944                   src = coding->source + offset;
6945                   src_end = coding->source + coding->src_bytes;
6946                 }
6947             }
6948           produced_chars = coding->consumed_char;
6949           while (src < src_end)
6950             *dst++ = *src++;
6951         }
6952     }
6953
       /* Account for what was written by this call.  */
6954   produced = dst - (coding->destination + coding->produced);
6955   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6956     insert_from_gap (produced_chars, produced);
6957   coding->produced += produced;
6958   coding->produced_char += produced_chars;
6959   return carryover;
6960 }
6961
6962 /* Compose text in CODING->object according to the annotation data at
6963    CHARBUF.  CHARBUF is an array:
6964      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6965  */
6966
6967 static INLINE void
6968 produce_composition (coding, charbuf, pos)
6969      struct coding_system *coding;
6970      int *charbuf;
6971      EMACS_INT pos;
6972 {
6973   int len;
6974   EMACS_INT to;
6975   enum composition_method method;
6976   Lisp_Object components;
6977
6978   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6979   to = pos + charbuf[2];
6980   method = (enum composition_method) (charbuf[4]);
6981
6982   if (method == COMPOSITION_RELATIVE)
6983     components = Qnil;
6984   else
6985     {
6986       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6987       int i, j;
6988
6989       if (method == COMPOSITION_WITH_RULE)
6990         len = charbuf[2] * 3 - 2;
6991       charbuf += MAX_ANNOTATION_LENGTH;
6992       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6993       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6994         {
6995           if (charbuf[i] >= 0)
6996             args[j] = make_number (charbuf[i]);
6997           else
6998             {
6999               i++;
7000               args[j] = make_number (charbuf[i] % 0x100);
7001             }
7002         }
7003       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7004     }
7005   compose_text (pos, to, components, Qnil, coding->dst_object);
7006 }
7007
7008
7009 /* Put `charset' property on text in CODING->object according to
7010    the annotation data at CHARBUF.  CHARBUF is an array:
7011      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7012  */
7013
7014 static INLINE void
7015 produce_charset (coding, charbuf, pos)
7016      struct coding_system *coding;
7017      int *charbuf;
7018      EMACS_INT pos;
7019 {
7020   EMACS_INT from = pos - charbuf[2];
7021   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7022
7023   Fput_text_property (make_number (from), make_number (pos),
7024                       Qcharset, CHARSET_NAME (charset),
7025                       coding->dst_object);
7026 }
7027
7028
/* Number of `int' elements first requested for the work area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca and record its length (in
   elements) in CODING->charbuf_size.  The first request is for
   CHARBUF_SIZE elements; the request is halved for as long as the
   allocation yields NULL and the size is still above 1024.

   If no buffer was obtained, CODING_RESULT_INSUFFICIENT_MEM is
   recorded and the expansion executes `return coding->result;', so
   this macro may only be used inside a function that can return that
   value.  Since the memory comes from alloca, it belongs to the stack
   frame of the function using the macro.

   NOTE(review): alloca does not normally report failure by returning
   NULL, so the retry and the error path may never be taken --
   confirm.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int work_size;                                                      \
                                                                        \
    coding->charbuf = NULL;                                             \
    for (work_size = CHARBUF_SIZE; work_size > 1024; work_size >>= 1)   \
      {                                                                 \
        coding->charbuf = (int *) alloca (sizeof (int) * work_size);    \
        if (coding->charbuf)                                            \
          break;                                                        \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result;                                          \
      }                                                                 \
    coding->charbuf_size = work_size;                                   \
  } while (0)
7050
7051
7052 static void
7053 produce_annotation (coding, pos)
7054      struct coding_system *coding;
7055      EMACS_INT pos;
7056 {
7057   int *charbuf = coding->charbuf;
7058   int *charbuf_end = charbuf + coding->charbuf_used;
7059
7060   if (NILP (coding->dst_object))
7061     return;
7062
7063   while (charbuf < charbuf_end)
7064     {
7065       if (*charbuf >= 0)
7066         pos++, charbuf++;
7067       else
7068         {
7069           int len = -*charbuf;
7070
7071           if (len > 2)
7072             switch (charbuf[1])
7073               {
7074               case CODING_ANNOTATE_COMPOSITION_MASK:
7075                 produce_composition (coding, charbuf, pos);
7076                 break;
7077               case CODING_ANNOTATE_CHARSET_MASK:
7078                 produce_charset (coding, charbuf, pos);
7079                 break;
7080               }
7081           charbuf += len;
7082         }
7083     }
7084 }
7085
7086 /* Decode the data at CODING->src_object into CODING->dst_object.
7087    CODING->src_object is a buffer, a string, or nil.
7088    CODING->dst_object is a buffer.
7089
7090    If CODING->src_object is a buffer, it must be the current buffer.
7091    In this case, if CODING->src_pos is positive, it is a position of
7092    the source text in the buffer, otherwise, the source text is in the
7093    gap area of the buffer, and CODING->src_pos specifies the offset of
7094    the text from GPT (which must be the same as PT).  If this is the
7095    same buffer as CODING->dst_object, CODING->src_pos must be
7096    negative.
7097
7098    If CODING->src_object is a string, CODING->src_pos is an index to
7099    that string.
7100
7101    If CODING->src_object is nil, CODING->source must already point to
7102    the non-relocatable memory area.  In this case, CODING->src_pos is
7103    an offset from CODING->source.
7104
7105    The decoded data is inserted at the current point of the buffer
7106    CODING->dst_object.
7107 */
7108
7109 static int
7110 decode_coding (coding)
7111      struct coding_system *coding;
7112 {
7113   Lisp_Object attrs;
7114   Lisp_Object undo_list;
7115   Lisp_Object translation_table;
7116   int carryover;
7117   int i;
7118
7119   if (BUFFERP (coding->src_object)
7120       && coding->src_pos > 0
7121       && coding->src_pos < GPT
7122       && coding->src_pos + coding->src_chars > GPT)
7123     move_gap_both (coding->src_pos, coding->src_pos_byte);
7124
7125   undo_list = Qt;
7126   if (BUFFERP (coding->dst_object))
7127     {
7128       if (current_buffer != XBUFFER (coding->dst_object))
7129         set_buffer_internal (XBUFFER (coding->dst_object));
7130       if (GPT != PT)
7131         move_gap_both (PT, PT_BYTE);
7132       undo_list = current_buffer->undo_list;
7133       current_buffer->undo_list = Qt;
7134     }
7135
7136   coding->consumed = coding->consumed_char = 0;
7137   coding->produced = coding->produced_char = 0;
7138   coding->chars_at_source = 0;
7139   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7140   coding->errors = 0;
7141
7142   ALLOC_CONVERSION_WORK_AREA (coding);
7143
7144   attrs = CODING_ID_ATTRS (coding->id);
7145   translation_table = get_translation_table (attrs, 0, NULL);
7146
7147   carryover = 0;
7148   do
7149     {
7150       EMACS_INT pos = coding->dst_pos + coding->produced_char;
7151
7152       coding_set_source (coding);
7153       coding->annotated = 0;
7154       coding->charbuf_used = carryover;
7155       (*(coding->decoder)) (coding);
7156       coding_set_destination (coding);
7157       carryover = produce_chars (coding, translation_table, 0);
7158       if (coding->annotated)
7159         produce_annotation (coding, pos);
7160       for (i = 0; i < carryover; i++)
7161         coding->charbuf[i]
7162           = coding->charbuf[coding->charbuf_used - carryover + i];
7163     }
7164   while (coding->consumed < coding->src_bytes
7165          && (coding->result == CODING_RESULT_SUCCESS
7166              || coding->result == CODING_RESULT_INVALID_SRC));
7167
7168   if (carryover > 0)
7169     {
7170       coding_set_destination (coding);
7171       coding->charbuf_used = carryover;
7172       produce_chars (coding, translation_table, 1);
7173     }
7174
7175   coding->carryover_bytes = 0;
7176   if (coding->consumed < coding->src_bytes)
7177     {
7178       int nbytes = coding->src_bytes - coding->consumed;
7179       const unsigned char *src;
7180
7181       coding_set_source (coding);
7182       coding_set_destination (coding);
7183       src = coding->source + coding->consumed;
7184
7185       if (coding->mode & CODING_MODE_LAST_BLOCK)
7186         {
7187           /* Flush out unprocessed data as binary chars.  We are sure
7188              that the number of data is less than the size of
7189              coding->charbuf.  */
7190           coding->charbuf_used = 0;
7191           coding->chars_at_source = 0;
7192
7193           while (nbytes-- > 0)
7194             {
7195               int c = *src++;
7196
7197               if (c & 0x80)
7198                 c = BYTE8_TO_CHAR (c);
7199               coding->charbuf[coding->charbuf_used++] = c;
7200             }
7201           produce_chars (coding, Qnil, 1);
7202         }
7203       else
7204         {
7205           /* Record unprocessed bytes in coding->carryover.  We are
7206              sure that the number of data is less than the size of
7207              coding->carryover.  */
7208           unsigned char *p = coding->carryover;
7209
7210           if (nbytes > sizeof coding->carryover)
7211             nbytes = sizeof coding->carryover;
7212           coding->carryover_bytes = nbytes;
7213           while (nbytes-- > 0)
7214             *p++ = *src++;
7215         }
7216       coding->consumed = coding->src_bytes;
7217     }
7218
7219   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
7220       && !inhibit_eol_conversion)
7221     decode_eol (coding);
7222   if (BUFFERP (coding->dst_object))
7223     {
7224       current_buffer->undo_list = undo_list;
7225       record_insert (coding->dst_pos, coding->produced_char);
7226     }
7227   return coding->result;
7228 }
7229
7230
7231 /* Extract an annotation datum from a composition starting at POS and
7232    ending before LIMIT of CODING->src_object (buffer or string), store
7233    the data in BUF, set *STOP to a starting position of the next
7234    composition (if any) or to LIMIT, and return the address of the
7235    next element of BUF.
7236
7237    If such an annotation is not found, set *STOP to a starting
7238    position of a composition after POS (if any) or to LIMIT, and
7239    return BUF.  */
7240
7241 static INLINE int *
7242 handle_composition_annotation (pos, limit, coding, buf, stop)
7243      EMACS_INT pos, limit;
7244      struct coding_system *coding;
7245      int *buf;
7246      EMACS_INT *stop;
7247 {
7248   EMACS_INT start, end;
7249   Lisp_Object prop;
7250
7251   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7252       || end > limit)
7253     *stop = limit;
7254   else if (start > pos)
7255     *stop = start;
7256   else
7257     {
7258       if (start == pos)
7259         {
7260           /* We found a composition.  Store the corresponding
7261              annotation data in BUF.  */
7262           int *head = buf;
7263           enum composition_method method = COMPOSITION_METHOD (prop);
7264           int nchars = COMPOSITION_LENGTH (prop);
7265
7266           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7267           if (method != COMPOSITION_RELATIVE)
7268             {
7269               Lisp_Object components;
7270               int len, i, i_byte;
7271
7272               components = COMPOSITION_COMPONENTS (prop);
7273               if (VECTORP (components))
7274                 {
7275                   len = XVECTOR (components)->size;
7276                   for (i = 0; i < len; i++)
7277                     *buf++ = XINT (AREF (components, i));
7278                 }
7279               else if (STRINGP (components))
7280                 {
7281                   len = SCHARS (components);
7282                   i = i_byte = 0;
7283                   while (i < len)
7284                     {
7285                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7286                       buf++;
7287                     }
7288                 }
7289               else if (INTEGERP (components))
7290                 {
7291                   len = 1;
7292                   *buf++ = XINT (components);
7293                 }
7294               else if (CONSP (components))
7295                 {
7296                   for (len = 0; CONSP (components);
7297                        len++, components = XCDR (components))
7298                     *buf++ = XINT (XCAR (components));
7299                 }
7300               else
7301                 abort ();
7302               *head -= len;
7303             }
7304         }
7305
7306       if (find_composition (end, limit, &start, &end, &prop,
7307                             coding->src_object)
7308           && end <= limit)
7309         *stop = start;
7310       else
7311         *stop = limit;
7312     }
7313   return buf;
7314 }
7315
7316
7317 /* Extract an annotation datum from a text property `charset' at POS of
7318    CODING->src_object (buffer of string), store the data in BUF, set
7319    *STOP to the position where the value of `charset' property changes
7320    (limiting by LIMIT), and return the address of the next element of
7321    BUF.
7322
7323    If the property value is nil, set *STOP to the position where the
7324    property value is non-nil (limiting by LIMIT), and return BUF.  */
7325
7326 static INLINE int *
7327 handle_charset_annotation (pos, limit, coding, buf, stop)
7328      EMACS_INT pos, limit;
7329      struct coding_system *coding;
7330      int *buf;
7331      EMACS_INT *stop;
7332 {
7333   Lisp_Object val, next;
7334   int id;
7335
7336   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7337   if (! NILP (val) && CHARSETP (val))
7338     id = XINT (CHARSET_SYMBOL_ID (val));
7339   else
7340     id = -1;
7341   ADD_CHARSET_DATA (buf, 0, id);
7342   next = Fnext_single_property_change (make_number (pos), Qcharset,
7343                                        coding->src_object,
7344                                        make_number (limit));
7345   *stop = XINT (next);
7346   return buf;
7347 }
7348
7349
7350 static void
7351 consume_chars (coding, translation_table, max_lookup)
7352      struct coding_system *coding;
7353      Lisp_Object translation_table;
7354      int max_lookup;
7355 {
7356   int *buf = coding->charbuf;
7357   int *buf_end = coding->charbuf + coding->charbuf_size;
7358   const unsigned char *src = coding->source + coding->consumed;
7359   const unsigned char *src_end = coding->source + coding->src_bytes;
7360   EMACS_INT pos = coding->src_pos + coding->consumed_char;
7361   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
7362   int multibytep = coding->src_multibyte;
7363   Lisp_Object eol_type;
7364   int c;
7365   EMACS_INT stop, stop_composition, stop_charset;
7366   int *lookup_buf = NULL;
7367
7368   if (! NILP (translation_table))
7369     lookup_buf = alloca (sizeof (int) * max_lookup);
7370
7371   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
7372   if (VECTORP (eol_type))
7373     eol_type = Qunix;
7374
7375   /* Note: composition handling is not yet implemented.  */
7376   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7377
7378   if (NILP (coding->src_object))
7379     stop = stop_composition = stop_charset = end_pos;
7380   else
7381     {
7382       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
7383         stop = stop_composition = pos;
7384       else
7385         stop = stop_composition = end_pos;
7386       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
7387         stop = stop_charset = pos;
7388       else
7389         stop_charset = end_pos;
7390     }
7391
7392   /* Compensate for CRLF and conversion.  */
7393   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
7394   while (buf < buf_end)
7395     {
7396       Lisp_Object trans;
7397
7398       if (pos == stop)
7399         {
7400           if (pos == end_pos)
7401             break;
7402           if (pos == stop_composition)
7403             buf = handle_composition_annotation (pos, end_pos, coding,
7404                                                  buf, &stop_composition);
7405           if (pos == stop_charset)
7406             buf = handle_charset_annotation (pos, end_pos, coding,
7407                                              buf, &stop_charset);
7408           stop = (stop_composition < stop_charset
7409                   ? stop_composition : stop_charset);
7410         }
7411
7412       if (! multibytep)
7413         {
7414           EMACS_INT bytes;
7415
7416           if (coding->encoder == encode_coding_raw_text)
7417             c = *src++, pos++;
7418           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
7419             c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
7420           else
7421             c = BYTE8_TO_CHAR (*src), src++, pos++;
7422         }
7423       else
7424         c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
7425       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
7426         c = '\n';
7427       if (! EQ (eol_type, Qunix))
7428         {
7429           if (c == '\n')
7430             {
7431               if (EQ (eol_type, Qdos))
7432                 *buf++ = '\r';
7433               else
7434                 c = '\r';
7435             }
7436         }
7437
7438       trans = Qnil;
7439       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
7440       if (NILP (trans))
7441         *buf++ = c;
7442       else
7443         {
7444           int from_nchars = 1, to_nchars = 1;
7445           int *lookup_buf_end;
7446           const unsigned char *p = src;
7447           int i;
7448
7449           lookup_buf[0] = c;
7450           for (i = 1; i < max_lookup && p < src_end; i++)
7451             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
7452           lookup_buf_end = lookup_buf + i;
7453           trans = get_translation (trans, lookup_buf, lookup_buf_end);
7454           if (INTEGERP (trans))
7455             c = XINT (trans);
7456           else if (CONSP (trans))
7457             {
7458               from_nchars = ASIZE (XCAR (trans));
7459               trans = XCDR (trans);
7460               if (INTEGERP (trans))
7461                 c = XINT (trans);
7462               else
7463                 {
7464                   to_nchars = ASIZE (trans);
7465                   if (buf + to_nchars > buf_end)
7466                     break;
7467                   c = XINT (AREF (trans, 0));
7468                 }
7469             }
7470           else
7471             break;
7472           *buf++ = c;
7473           for (i = 1; i < to_nchars; i++)
7474             *buf++ = XINT (AREF (trans, i));
7475           for (i = 1; i < from_nchars; i++, pos++)
7476             src += MULTIBYTE_LENGTH_NO_CHECK (src);
7477         }
7478     }
7479
7480   coding->consumed = src - coding->source;
7481   coding->consumed_char = pos - coding->src_pos;
7482   coding->charbuf_used = buf - coding->charbuf;
7483   coding->chars_at_source = 0;
7484 }
7485
7486
7487 /* Encode the text at CODING->src_object into CODING->dst_object.
7488    CODING->src_object is a buffer or a string.
7489    CODING->dst_object is a buffer or nil.
7490
7491    If CODING->src_object is a buffer, it must be the current buffer.
7492    In this case, if CODING->src_pos is positive, it is a position of
7493    the source text in the buffer, otherwise. the source text is in the
7494    gap area of the buffer, and coding->src_pos specifies the offset of
7495    the text from GPT (which must be the same as PT).  If this is the
7496    same buffer as CODING->dst_object, CODING->src_pos must be
7497    negative and CODING should not have `pre-write-conversion'.
7498
7499    If CODING->src_object is a string, CODING should not have
7500    `pre-write-conversion'.
7501
7502    If CODING->dst_object is a buffer, the encoded data is inserted at
7503    the current point of that buffer.
7504
7505    If CODING->dst_object is nil, the encoded data is placed at the
7506    memory area specified by CODING->destination.  */
7507
7508 static int
7509 encode_coding (coding)
7510      struct coding_system *coding;
7511 {
7512   Lisp_Object attrs;
7513   Lisp_Object translation_table;
7514   int max_lookup;
7515
7516   attrs = CODING_ID_ATTRS (coding->id);
7517   if (coding->encoder == encode_coding_raw_text)
7518     translation_table = Qnil, max_lookup = 0;
7519   else
7520     translation_table = get_translation_table (attrs, 1, &max_lookup);
7521
7522   if (BUFFERP (coding->dst_object))
7523     {
7524       set_buffer_internal (XBUFFER (coding->dst_object));
7525       coding->dst_multibyte
7526         = ! NILP (current_buffer->enable_multibyte_characters);
7527     }
7528
7529   coding->consumed = coding->consumed_char = 0;
7530   coding->produced = coding->produced_char = 0;
7531   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7532   coding->errors = 0;
7533
7534   ALLOC_CONVERSION_WORK_AREA (coding);
7535
7536   do {
7537     coding_set_source (coding);
7538     consume_chars (coding, translation_table, max_lookup);
7539     coding_set_destination (coding);
7540     (*(coding->encoder)) (coding);
7541   } while (coding->consumed_char < coding->src_chars);
7542
7543   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7544     insert_from_gap (coding->produced_char, coding->produced);
7545
7546   return (coding->result);
7547 }
7548
7549
/* Name (or base name) of work buffer for code conversion.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer increments it on every call, and
   code_conversion_restore resets it to 0 when it releases the
   reused buffer.  */
static int reused_workbuf_in_use;
7562
7563
7564 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
7565    multibyteness of returning buffer.  */
7566
7567 static Lisp_Object
7568 make_conversion_work_buffer (multibyte)
7569      int multibyte;
7570 {
7571   Lisp_Object name, workbuf;
7572   struct buffer *current;
7573
7574   if (reused_workbuf_in_use++)
7575     {
7576       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7577       workbuf = Fget_buffer_create (name);
7578     }
7579   else
7580     {
7581       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7582         Vcode_conversion_reused_workbuf
7583           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7584       workbuf = Vcode_conversion_reused_workbuf;
7585     }
7586   current = current_buffer;
7587   set_buffer_internal (XBUFFER (workbuf));
7588   /* We can't allow modification hooks to run in the work buffer.  For
7589      instance, directory_files_internal assumes that file decoding
7590      doesn't compile new regexps.  */
7591   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7592   Ferase_buffer ();
7593   current_buffer->undo_list = Qt;
7594   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7595   set_buffer_internal (current);
7596   return workbuf;
7597 }
7598
7599
7600 static Lisp_Object
7601 code_conversion_restore (arg)
7602      Lisp_Object arg;
7603 {
7604   Lisp_Object current, workbuf;
7605   struct gcpro gcpro1;
7606
7607   GCPRO1 (arg);
7608   current = XCAR (arg);
7609   workbuf = XCDR (arg);
7610   if (! NILP (workbuf))
7611     {
7612       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7613         reused_workbuf_in_use = 0;
7614       else if (! NILP (Fbuffer_live_p (workbuf)))
7615         Fkill_buffer (workbuf);
7616     }
7617   set_buffer_internal (XBUFFER (current));
7618   UNGCPRO;
7619   return Qnil;
7620 }
7621
7622 Lisp_Object
7623 code_conversion_save (with_work_buf, multibyte)
7624      int with_work_buf, multibyte;
7625 {
7626   Lisp_Object workbuf = Qnil;
7627
7628   if (with_work_buf)
7629     workbuf = make_conversion_work_buffer (multibyte);
7630   record_unwind_protect (code_conversion_restore,
7631                          Fcons (Fcurrent_buffer (), workbuf));
7632   return workbuf;
7633 }
7634
7635 int
7636 decode_coding_gap (coding, chars, bytes)
7637      struct coding_system *coding;
7638      EMACS_INT chars, bytes;
7639 {
7640   int count = specpdl_ptr - specpdl;
7641   Lisp_Object attrs;
7642
7643   code_conversion_save (0, 0);
7644
7645   coding->src_object = Fcurrent_buffer ();
7646   coding->src_chars = chars;
7647   coding->src_bytes = bytes;
7648   coding->src_pos = -chars;
7649   coding->src_pos_byte = -bytes;
7650   coding->src_multibyte = chars < bytes;
7651   coding->dst_object = coding->src_object;
7652   coding->dst_pos = PT;
7653   coding->dst_pos_byte = PT_BYTE;
7654   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7655
7656   if (CODING_REQUIRE_DETECTION (coding))
7657     detect_coding (coding);
7658
7659   coding->mode |= CODING_MODE_LAST_BLOCK;
7660   current_buffer->text->inhibit_shrinking = 1;
7661   decode_coding (coding);
7662   current_buffer->text->inhibit_shrinking = 0;
7663
7664   attrs = CODING_ID_ATTRS (coding->id);
7665   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7666     {
7667       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7668       Lisp_Object val;
7669
7670       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7671       val = call1 (CODING_ATTR_POST_READ (attrs),
7672                    make_number (coding->produced_char));
7673       CHECK_NATNUM (val);
7674       coding->produced_char += Z - prev_Z;
7675       coding->produced += Z_BYTE - prev_Z_BYTE;
7676     }
7677
7678   unbind_to (count, Qnil);
7679   return coding->result;
7680 }
7681
7682 int
7683 encode_coding_gap (coding, chars, bytes)
7684      struct coding_system *coding;
7685      EMACS_INT chars, bytes;
7686 {
7687   int count = specpdl_ptr - specpdl;
7688
7689   code_conversion_save (0, 0);
7690
7691   coding->src_object = Fcurrent_buffer ();
7692   coding->src_chars = chars;
7693   coding->src_bytes = bytes;
7694   coding->src_pos = -chars;
7695   coding->src_pos_byte = -bytes;
7696   coding->src_multibyte = chars < bytes;
7697   coding->dst_object = coding->src_object;
7698   coding->dst_pos = PT;
7699   coding->dst_pos_byte = PT_BYTE;
7700
7701   encode_coding (coding);
7702
7703   unbind_to (count, Qnil);
7704   return coding->result;
7705 }
7706
7707
7708 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7709    SRC_OBJECT into DST_OBJECT by coding context CODING.
7710
7711    SRC_OBJECT is a buffer, a string, or Qnil.
7712
7713    If it is a buffer, the text is at point of the buffer.  FROM and TO
7714    are positions in the buffer.
7715
7716    If it is a string, the text is at the beginning of the string.
7717    FROM and TO are indices to the string.
7718
7719    If it is nil, the text is at coding->source.  FROM and TO are
7720    indices to coding->source.
7721
7722    DST_OBJECT is a buffer, Qt, or Qnil.
7723
7724    If it is a buffer, the decoded text is inserted at point of the
7725    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7726    is deleted.
7727
7728    If it is Qt, a string is made from the decoded text, and
7729    set in CODING->dst_object.
7730
7731    If it is Qnil, the decoded text is stored at CODING->destination.
7732    The caller must allocate CODING->dst_bytes bytes at
7733    CODING->destination by xmalloc.  If the decoded text is longer than
7734    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7735  */
7736
7737 void
7738 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7739                       dst_object)
7740      struct coding_system *coding;
7741      Lisp_Object src_object;
7742      EMACS_INT from, from_byte, to, to_byte;
7743      Lisp_Object dst_object;
7744 {
7745   int count = specpdl_ptr - specpdl;
7746   unsigned char *destination;
7747   EMACS_INT dst_bytes;
7748   EMACS_INT chars = to - from;
7749   EMACS_INT bytes = to_byte - from_byte;
7750   Lisp_Object attrs;
7751   int saved_pt = -1, saved_pt_byte;
7752   int need_marker_adjustment = 0;
7753   Lisp_Object old_deactivate_mark;
7754
7755   old_deactivate_mark = Vdeactivate_mark;
7756
7757   if (NILP (dst_object))
7758     {
7759       destination = coding->destination;
7760       dst_bytes = coding->dst_bytes;
7761     }
7762
7763   coding->src_object = src_object;
7764   coding->src_chars = chars;
7765   coding->src_bytes = bytes;
7766   coding->src_multibyte = chars < bytes;
7767
7768   if (STRINGP (src_object))
7769     {
7770       coding->src_pos = from;
7771       coding->src_pos_byte = from_byte;
7772     }
7773   else if (BUFFERP (src_object))
7774     {
7775       set_buffer_internal (XBUFFER (src_object));
7776       if (from != GPT)
7777         move_gap_both (from, from_byte);
7778       if (EQ (src_object, dst_object))
7779         {
7780           struct Lisp_Marker *tail;
7781
7782           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7783             {
7784               tail->need_adjustment
7785                 = tail->charpos == (tail->insertion_type ? from : to);
7786               need_marker_adjustment |= tail->need_adjustment;
7787             }
7788           saved_pt = PT, saved_pt_byte = PT_BYTE;
7789           TEMP_SET_PT_BOTH (from, from_byte);
7790           current_buffer->text->inhibit_shrinking = 1;
7791           del_range_both (from, from_byte, to, to_byte, 1);
7792           coding->src_pos = -chars;
7793           coding->src_pos_byte = -bytes;
7794         }
7795       else
7796         {
7797           coding->src_pos = from;
7798           coding->src_pos_byte = from_byte;
7799         }
7800     }
7801
7802   if (CODING_REQUIRE_DETECTION (coding))
7803     detect_coding (coding);
7804   attrs = CODING_ID_ATTRS (coding->id);
7805
7806   if (EQ (dst_object, Qt)
7807       || (! NILP (CODING_ATTR_POST_READ (attrs))
7808           && NILP (dst_object)))
7809     {
7810       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7811       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7812       coding->dst_pos = BEG;
7813       coding->dst_pos_byte = BEG_BYTE;
7814     }
7815   else if (BUFFERP (dst_object))
7816     {
7817       code_conversion_save (0, 0);
7818       coding->dst_object = dst_object;
7819       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7820       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7821       coding->dst_multibyte
7822         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7823     }
7824   else
7825     {
7826       code_conversion_save (0, 0);
7827       coding->dst_object = Qnil;
7828       /* Most callers presume this will return a multibyte result, and they
7829          won't use `binary' or `raw-text' anyway, so let's not worry about
7830          CODING_FOR_UNIBYTE.  */
7831       coding->dst_multibyte = 1;
7832     }
7833
7834   decode_coding (coding);
7835
7836   if (BUFFERP (coding->dst_object))
7837     set_buffer_internal (XBUFFER (coding->dst_object));
7838
7839   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7840     {
7841       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7842       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7843       Lisp_Object val;
7844
7845       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7846       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7847               old_deactivate_mark);
7848       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7849                         make_number (coding->produced_char));
7850       UNGCPRO;
7851       CHECK_NATNUM (val);
7852       coding->produced_char += Z - prev_Z;
7853       coding->produced += Z_BYTE - prev_Z_BYTE;
7854     }
7855
7856   if (EQ (dst_object, Qt))
7857     {
7858       coding->dst_object = Fbuffer_string ();
7859     }
7860   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7861     {
7862       set_buffer_internal (XBUFFER (coding->dst_object));
7863       if (dst_bytes < coding->produced)
7864         {
7865           destination = xrealloc (destination, coding->produced);
7866           if (! destination)
7867             {
7868               record_conversion_result (coding,
7869                                         CODING_RESULT_INSUFFICIENT_DST);
7870               unbind_to (count, Qnil);
7871               return;
7872             }
7873           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7874             move_gap_both (BEGV, BEGV_BYTE);
7875           bcopy (BEGV_ADDR, destination, coding->produced);
7876           coding->destination = destination;
7877         }
7878     }
7879
7880   if (saved_pt >= 0)
7881     {
7882       /* This is the case of:
7883          (BUFFERP (src_object) && EQ (src_object, dst_object))
7884          As we have moved PT while replacing the original buffer
7885          contents, we must recover it now.  */
7886       set_buffer_internal (XBUFFER (src_object));
7887       current_buffer->text->inhibit_shrinking = 0;
7888       if (saved_pt < from)
7889         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7890       else if (saved_pt < from + chars)
7891         TEMP_SET_PT_BOTH (from, from_byte);
7892       else if (! NILP (current_buffer->enable_multibyte_characters))
7893         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7894                           saved_pt_byte + (coding->produced - bytes));
7895       else
7896         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7897                           saved_pt_byte + (coding->produced - bytes));
7898
7899       if (need_marker_adjustment)
7900         {
7901           struct Lisp_Marker *tail;
7902
7903           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7904             if (tail->need_adjustment)
7905               {
7906                 tail->need_adjustment = 0;
7907                 if (tail->insertion_type)
7908                   {
7909                     tail->bytepos = from_byte;
7910                     tail->charpos = from;
7911                   }
7912                 else
7913                   {
7914                     tail->bytepos = from_byte + coding->produced;
7915                     tail->charpos
7916                       = (NILP (current_buffer->enable_multibyte_characters)
7917                          ? tail->bytepos : from + coding->produced_char);
7918                   }
7919               }
7920         }
7921     }
7922
7923   Vdeactivate_mark = old_deactivate_mark;
7924   unbind_to (count, coding->dst_object);
7925 }
7926
7927
7928 void
7929 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7930                       dst_object)
7931      struct coding_system *coding;
7932      Lisp_Object src_object;
7933      EMACS_INT from, from_byte, to, to_byte;
7934      Lisp_Object dst_object;
7935 {
7936   int count = specpdl_ptr - specpdl;
7937   EMACS_INT chars = to - from;
7938   EMACS_INT bytes = to_byte - from_byte;
7939   Lisp_Object attrs;
7940   int saved_pt = -1, saved_pt_byte;
7941   int need_marker_adjustment = 0;
7942   int kill_src_buffer = 0;
7943   Lisp_Object old_deactivate_mark;
7944
7945   old_deactivate_mark = Vdeactivate_mark;
7946
7947   coding->src_object = src_object;
7948   coding->src_chars = chars;
7949   coding->src_bytes = bytes;
7950   coding->src_multibyte = chars < bytes;
7951
7952   attrs = CODING_ID_ATTRS (coding->id);
7953
7954   if (EQ (src_object, dst_object))
7955     {
7956       struct Lisp_Marker *tail;
7957
7958       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7959         {
7960           tail->need_adjustment
7961             = tail->charpos == (tail->insertion_type ? from : to);
7962           need_marker_adjustment |= tail->need_adjustment;
7963         }
7964     }
7965
7966   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7967     {
7968       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7969       set_buffer_internal (XBUFFER (coding->src_object));
7970       if (STRINGP (src_object))
7971         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7972       else if (BUFFERP (src_object))
7973         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7974       else
7975         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7976
7977       if (EQ (src_object, dst_object))
7978         {
7979           set_buffer_internal (XBUFFER (src_object));
7980           saved_pt = PT, saved_pt_byte = PT_BYTE;
7981           del_range_both (from, from_byte, to, to_byte, 1);
7982           set_buffer_internal (XBUFFER (coding->src_object));
7983         }
7984
7985       {
7986         Lisp_Object args[3];
7987         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7988
7989         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7990                 old_deactivate_mark);
7991         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7992         args[1] = make_number (BEG);
7993         args[2] = make_number (Z);
7994         safe_call (3, args);
7995         UNGCPRO;
7996       }
7997       if (XBUFFER (coding->src_object) != current_buffer)
7998         kill_src_buffer = 1;
7999       coding->src_object = Fcurrent_buffer ();
8000       if (BEG != GPT)
8001         move_gap_both (BEG, BEG_BYTE);
8002       coding->src_chars = Z - BEG;
8003       coding->src_bytes = Z_BYTE - BEG_BYTE;
8004       coding->src_pos = BEG;
8005       coding->src_pos_byte = BEG_BYTE;
8006       coding->src_multibyte = Z < Z_BYTE;
8007     }
8008   else if (STRINGP (src_object))
8009     {
8010       code_conversion_save (0, 0);
8011       coding->src_pos = from;
8012       coding->src_pos_byte = from_byte;
8013     }
8014   else if (BUFFERP (src_object))
8015     {
8016       code_conversion_save (0, 0);
8017       set_buffer_internal (XBUFFER (src_object));
8018       if (EQ (src_object, dst_object))
8019         {
8020           saved_pt = PT, saved_pt_byte = PT_BYTE;
8021           coding->src_object = del_range_1 (from, to, 1, 1);
8022           coding->src_pos = 0;
8023           coding->src_pos_byte = 0;
8024         }
8025       else
8026         {
8027           if (from < GPT && to >= GPT)
8028             move_gap_both (from, from_byte);
8029           coding->src_pos = from;
8030           coding->src_pos_byte = from_byte;
8031         }
8032     }
8033   else
8034     code_conversion_save (0, 0);
8035
8036   if (BUFFERP (dst_object))
8037     {
8038       coding->dst_object = dst_object;
8039       if (EQ (src_object, dst_object))
8040         {
8041           coding->dst_pos = from;
8042           coding->dst_pos_byte = from_byte;
8043         }
8044       else
8045         {
8046           struct buffer *current = current_buffer;
8047
8048           set_buffer_temp (XBUFFER (dst_object));
8049           coding->dst_pos = PT;
8050           coding->dst_pos_byte = PT_BYTE;
8051           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8052           set_buffer_temp (current);
8053         }
8054       coding->dst_multibyte
8055         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
8056     }
8057   else if (EQ (dst_object, Qt))
8058     {
8059       coding->dst_object = Qnil;
8060       coding->dst_bytes = coding->src_chars;
8061       if (coding->dst_bytes == 0)
8062         coding->dst_bytes = 1;
8063       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
8064       coding->dst_multibyte = 0;
8065     }
8066   else
8067     {
8068       coding->dst_object = Qnil;
8069       coding->dst_multibyte = 0;
8070     }
8071
8072   encode_coding (coding);
8073
8074   if (EQ (dst_object, Qt))
8075     {
8076       if (BUFFERP (coding->dst_object))
8077         coding->dst_object = Fbuffer_string ();
8078       else
8079         {
8080           coding->dst_object
8081             = make_unibyte_string ((char *) coding->destination,
8082                                    coding->produced);
8083           xfree (coding->destination);
8084         }
8085     }
8086
8087   if (saved_pt >= 0)
8088     {
8089       /* This is the case of:
8090          (BUFFERP (src_object) && EQ (src_object, dst_object))
8091          As we have moved PT while replacing the original buffer
8092          contents, we must recover it now.  */
8093       set_buffer_internal (XBUFFER (src_object));
8094       if (saved_pt < from)
8095         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8096       else if (saved_pt < from + chars)
8097         TEMP_SET_PT_BOTH (from, from_byte);
8098       else if (! NILP (current_buffer->enable_multibyte_characters))
8099         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8100                           saved_pt_byte + (coding->produced - bytes));
8101       else
8102         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8103                           saved_pt_byte + (coding->produced - bytes));
8104
8105       if (need_marker_adjustment)
8106         {
8107           struct Lisp_Marker *tail;
8108
8109           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8110             if (tail->need_adjustment)
8111               {
8112                 tail->need_adjustment = 0;
8113                 if (tail->insertion_type)
8114                   {
8115                     tail->bytepos = from_byte;
8116                     tail->charpos = from;
8117                   }
8118                 else
8119                   {
8120                     tail->bytepos = from_byte + coding->produced;
8121                     tail->charpos
8122                       = (NILP (current_buffer->enable_multibyte_characters)
8123                          ? tail->bytepos : from + coding->produced_char);
8124                   }
8125               }
8126         }
8127     }
8128
8129   if (kill_src_buffer)
8130     Fkill_buffer (coding->src_object);
8131
8132   Vdeactivate_mark = old_deactivate_mark;
8133   unbind_to (count, Qnil);
8134 }
8135
8136
8137 Lisp_Object
8138 preferred_coding_system ()
8139 {
8140   int id = coding_categories[coding_priorities[0]].id;
8141
8142   return CODING_ID_NAME (id);
8143 }
8144
8145 \f
8146 #ifdef emacs
8147 /*** 8. Emacs Lisp library functions ***/
8148
8149 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8150        doc: /* Return t if OBJECT is nil or a coding-system.
8151 See the documentation of `define-coding-system' for information
8152 about coding-system objects.  */)
8153      (object)
8154      Lisp_Object object;
8155 {
8156   if (NILP (object)
8157       || CODING_SYSTEM_ID (object) >= 0)
8158     return Qt;
8159   if (! SYMBOLP (object)
8160       || NILP (Fget (object, Qcoding_system_define_form)))
8161     return Qnil;
8162   return Qt;
8163 }
8164
8165 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8166        Sread_non_nil_coding_system, 1, 1, 0,
8167        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8168      (prompt)
8169      Lisp_Object prompt;
8170 {
8171   Lisp_Object val;
8172   do
8173     {
8174       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8175                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8176     }
8177   while (SCHARS (val) == 0);
8178   return (Fintern (val, Qnil));
8179 }
8180
8181 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8182        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8183 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8184 Ignores case when completing coding systems (all Emacs coding systems
8185 are lower-case).  */)
8186      (prompt, default_coding_system)
8187      Lisp_Object prompt, default_coding_system;
8188 {
8189   Lisp_Object val;
8190   int count = SPECPDL_INDEX ();
8191
8192   if (SYMBOLP (default_coding_system))
8193     default_coding_system = SYMBOL_NAME (default_coding_system);
8194   specbind (Qcompletion_ignore_case, Qt);
8195   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8196                           Qt, Qnil, Qcoding_system_history,
8197                           default_coding_system, Qnil);
8198   unbind_to (count, Qnil);
8199   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8200 }
8201
8202 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8203        1, 1, 0,
8204        doc: /* Check validity of CODING-SYSTEM.
8205 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8206 It is valid if it is nil or a symbol defined as a coding system by the
8207 function `define-coding-system'.  */)
8208   (coding_system)
8209      Lisp_Object coding_system;
8210 {
8211   Lisp_Object define_form;
8212
8213   define_form = Fget (coding_system, Qcoding_system_define_form);
8214   if (! NILP (define_form))
8215     {
8216       Fput (coding_system, Qcoding_system_define_form, Qnil);
8217       safe_eval (define_form);
8218     }
8219   if (!NILP (Fcoding_system_p (coding_system)))
8220     return coding_system;
8221   xsignal1 (Qcoding_system_error, coding_system);
8222 }
8223
8224 \f
/* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
   HIGHEST is nonzero, return the coding system of the highest
   priority among the detected coding systems.  Otherwise return a
   list of detected coding systems sorted by their priorities.  If
   MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
   multibyte form but contain only ASCII and eight-bit chars.
   Otherwise, the bytes are raw bytes.

   CODING-SYSTEM controls the detection as below:

   If it is nil, detect both text-format and eol-format.  If the
   text-format part of CODING-SYSTEM is already specified
   (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
   part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
   detect only text-format.

   SRC_CHARS is only recorded in the local coding structure that is
   handed to the detectors.  When HIGHEST is nonzero and the list of
   candidates ends up empty, the value is nil.  */

Lisp_Object
detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
                      coding_system)
     const unsigned char *src;
     EMACS_INT src_chars, src_bytes;
     int highest;
     int multibytep;
     Lisp_Object coding_system;
{
  const unsigned char *src_end = src + src_bytes;
  Lisp_Object attrs, eol_type;
  /* Candidate list.  It holds coding system ids (as Lisp integers)
     until the eol pass at the end replaces each by a name.  */
  Lisp_Object val = Qnil;
  struct coding_system coding;
  int id;
  struct coding_detection_info detect_info;
  enum coding_category base_category;
  int null_byte_found = 0, eight_bit_found = 0;

  /* A nil CODING_SYSTEM is handled as Qundecided.  */
  if (NILP (coding_system))
    coding_system = Qundecided;
  setup_coding_system (coding_system, &coding);
  attrs = CODING_ID_ATTRS (coding.id);
  eol_type = CODING_ID_EOL_TYPE (coding.id);
  coding_system = CODING_ATTR_BASE_NAME (attrs);

  /* Describe the source to the detectors as one final block.  */
  coding.source = src;
  coding.src_chars = src_chars;
  coding.src_bytes = src_bytes;
  coding.src_multibyte = multibytep;
  coding.consumed = 0;
  coding.mode |= CODING_MODE_LAST_BLOCK;
  coding.head_ascii = 0;

  detect_info.checked = detect_info.found = detect_info.rejected = 0;

  /* At first, detect text-format if necessary.  */
  base_category = XINT (CODING_ATTR_CATEGORY (attrs));
  if (base_category == coding_category_undecided)
    {
      /* CATEGORY and THIS are set by the priority loops below and are
         read again when the HIGHEST result is built.  */
      enum coding_category category;
      struct coding_system *this;
      int c, i;

      /* Skip all ASCII bytes except for a few ISO2022 controls.  */
      /* coding.head_ascii counts the bytes seen before the first
         8-bit byte.  The scan stops early once both a null byte and
         an 8-bit byte have been seen, or when the ISO-2022 detector
         reports that it scanned everything.  */
      for (; src < src_end; src++)
        {
          c = *src;
          if (c & 0x80)
            {
              eight_bit_found = 1;
              if (null_byte_found)
                break;
            }
          else if (c < 0x20)
            {
              /* The ISO-2022 detector is tried at most once (it is
                 skipped as soon as detect_info.checked is nonzero).  */
              if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
                  && ! inhibit_iso_escape_detection
                  && ! detect_info.checked)
                {
                  if (detect_coding_iso_2022 (&coding, &detect_info))
                    {
                      /* We have scanned the whole data.  */
                      if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
                        {
                          /* We didn't find an 8-bit code.  We may
                             have found a null-byte, but it's very
                             rare that a binary file conform to
                             ISO-2022.  */
                          src = src_end;
                          coding.head_ascii = src - coding.source;
                        }
                      detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
                      break;
                    }
                }
              else if (! c && !inhibit_null_byte_detection)
                {
                  null_byte_found = 1;
                  if (eight_bit_found)
                    break;
                }
              if (! eight_bit_found)
                coding.head_ascii++;
            }
          else if (! eight_bit_found)
            coding.head_ascii++;
        }

      /* Run the per-category pass only if the scan saw something
         other than plain ASCII or a detector already found a match.  */
      if (null_byte_found || eight_bit_found
          || coding.head_ascii < coding.src_bytes
          || detect_info.found)
        {
          if (coding.head_ascii == coding.src_bytes)
            /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
            for (i = 0; i < coding_category_raw_text; i++)
              {
                category = coding_priorities[i];
                this = coding_categories + category;
                if (detect_info.found & (1 << category))
                  break;
              }
          else
            {
              if (null_byte_found)
                {
                  /* With a null byte present, mark every category
                     outside CATEGORY_MASK_UTF_16 as checked and
                     rejected.  */
                  detect_info.checked |= ~CATEGORY_MASK_UTF_16;
                  detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
                }
              /* Walk the categories in priority order, calling the
                 detector of each one that is not yet checked.  When
                 HIGHEST is set, stop at the first category found.  */
              for (i = 0; i < coding_category_raw_text; i++)
                {
                  category = coding_priorities[i];
                  this = coding_categories + category;

                  if (this->id < 0)
                    {
                      /* No coding system of this category is defined.  */
                      detect_info.rejected |= (1 << category);
                    }
                  else if (category >= coding_category_raw_text)
                    continue;
                  else if (detect_info.checked & (1 << category))
                    {
                      if (highest
                          && (detect_info.found & (1 << category)))
                        break;
                    }
                  else if ((*(this->detector)) (&coding, &detect_info)
                           && highest
                           && (detect_info.found & (1 << category)))
                    {
                      /* For the UTF-16 auto category, narrow CATEGORY
                         to the LE or BE category.  THIS is left
                         pointing at the auto category entry.  */
                      if (category == coding_category_utf_16_auto)
                        {
                          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
                            category = coding_category_utf_16_le;
                          else
                            category = coding_category_utf_16_be;
                        }
                      break;
                    }
                }
            }
        }

      /* Turn the detection masks into the candidate list VAL.  */
      if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
          || null_byte_found)
        {
          /* Everything was rejected or a null byte was seen: the only
             candidate is Qno_conversion.  */
          detect_info.found = CATEGORY_MASK_RAW_TEXT;
          id = CODING_SYSTEM_ID (Qno_conversion);
          val = Fcons (make_number (id), Qnil);
        }
      else if (! detect_info.rejected && ! detect_info.found)
        {
          /* Nothing found and nothing rejected: answer with the
             coding system of the undecided category.  */
          detect_info.found = CATEGORY_MASK_ANY;
          id = coding_categories[coding_category_undecided].id;
          val = Fcons (make_number (id), Qnil);
        }
      else if (highest)
        {
          if (detect_info.found)
            {
              /* Use the category the priority loop stopped at.  */
              detect_info.found = 1 << category;
              val = Fcons (make_number (this->id), Qnil);
            }
          else
            /* Nothing was positively found; take the first category
               in priority order that has not been rejected.  */
            for (i = 0; i < coding_category_raw_text; i++)
              if (! (detect_info.rejected & (1 << coding_priorities[i])))
                {
                  detect_info.found = 1 << coding_priorities[i];
                  id = coding_categories[coding_priorities[i]].id;
                  val = Fcons (make_number (id), Qnil);
                  break;
                }
        }
      else
        {
          int mask = detect_info.rejected | detect_info.found;
          int found = 0;

          /* Both loops run from lowest to highest priority and push
             onto the front of VAL, so the finished list is in
             priority order.  This first loop collects the categories
             that were neither rejected nor found.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (! (mask & (1 << category)))
                {
                  found |= 1 << category;
                  id = coding_categories[category].id;
                  if (id >= 0)
                    val = Fcons (make_number (id), val);
                }
            }
          /* The positively found categories are pushed afterwards, so
             they end up ahead of the ones collected above.  */
          for (i = coding_category_raw_text - 1; i >= 0; i--)
            {
              category = coding_priorities[i];
              if (detect_info.found & (1 << category))
                {
                  id = coding_categories[category].id;
                  val = Fcons (make_number (id), val);
                }
            }
          detect_info.found |= found;
        }
    }
  else if (base_category == coding_category_utf_8_auto)
    {
      /* Only decide between the with-signature and without-signature
         UTF-8 categories.  VAL stays nil if the detector fails.  */
      if (detect_coding_utf_8 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
            this = coding_categories + coding_category_utf_8_sig;
          else
            this = coding_categories + coding_category_utf_8_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else if (base_category == coding_category_utf_16_auto)
    {
      /* Only decide among the UTF-16 categories.  VAL stays nil if
         the detector fails.  */
      if (detect_coding_utf_16 (&coding, &detect_info))
        {
          struct coding_system *this;

          if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
            this = coding_categories + coding_category_utf_16_le;
          else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
            this = coding_categories + coding_category_utf_16_be;
          else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
            this = coding_categories + coding_category_utf_16_be_nosig;
          else
            this = coding_categories + coding_category_utf_16_le_nosig;
          val = Fcons (make_number (this->id), Qnil);
        }
    }
  else
    {
      /* The text-format is already specified; use it as is.  */
      detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
      val = Fcons (make_number (coding.id), Qnil);
    }

  /* Then, detect eol-format if necessary.  */
  {
    /* One eol result per byte layout; -1 means not determined.  */
    int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
    Lisp_Object tail;

    if (VECTORP (eol_type))
      {
        /* The eol-format is not specified yet, so detect it for each
           kind of category that was found.  */
        if (detect_info.found & ~CATEGORY_MASK_UTF_16)
          {
            /* When a null byte was seen, EOL_SEEN_LF is used without
               scanning.  */
            if (null_byte_found)
              normal_eol = EOL_SEEN_LF;
            else
              normal_eol = detect_eol (coding.source, src_bytes,
                                       coding_category_raw_text);
          }
        if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
                                 | CATEGORY_MASK_UTF_16_BE_NOSIG))
          utf_16_be_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_be);
        if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
                                 | CATEGORY_MASK_UTF_16_LE_NOSIG))
          utf_16_le_eol = detect_eol (coding.source, src_bytes,
                                      coding_category_utf_16_le);
      }
    else
      {
        /* The eol-format is already specified by CODING-SYSTEM.  */
        if (EQ (eol_type, Qunix))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
        else if (EQ (eol_type, Qdos))
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
        else
          normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
      }

    /* Replace each id in VAL by a coding system name, picking the
       eol variant that matches the detected eol-format if possible.  */
    for (tail = val; CONSP (tail); tail = XCDR (tail))
      {
        enum coding_category category;
        int this_eol;

        id = XINT (XCAR (tail));
        attrs = CODING_ID_ATTRS (id);
        category = XINT (CODING_ATTR_CATEGORY (attrs));
        eol_type = CODING_ID_EOL_TYPE (id);
        if (VECTORP (eol_type))
          {
            if (category == coding_category_utf_16_be
                || category == coding_category_utf_16_be_nosig)
              this_eol = utf_16_be_eol;
            else if (category == coding_category_utf_16_le
                     || category == coding_category_utf_16_le_nosig)
              this_eol = utf_16_le_eol;
            else
              this_eol = normal_eol;

            /* Elements 0, 1 and 2 of the eol vector are used for LF,
               CRLF and CR respectively; otherwise keep the name of
               the coding system itself.  */
            if (this_eol == EOL_SEEN_LF)
              XSETCAR (tail, AREF (eol_type, 0));
            else if (this_eol == EOL_SEEN_CRLF)
              XSETCAR (tail, AREF (eol_type, 1));
            else if (this_eol == EOL_SEEN_CR)
              XSETCAR (tail, AREF (eol_type, 2));
            else
              XSETCAR (tail, CODING_ID_NAME (id));
          }
        else
          XSETCAR (tail, CODING_ID_NAME (id));
      }
  }

  return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
}
8548
8549
8550 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8551        2, 3, 0,
8552        doc: /* Detect coding system of the text in the region between START and END.
8553 Return a list of possible coding systems ordered by priority.
8554 The coding systems to try and their priorities follows what
8555 the function `coding-system-priority-list' (which see) returns.
8556
8557 If only ASCII characters are found (except for such ISO-2022 control
8558 characters as ESC), it returns a list of single element `undecided'
8559 or its subsidiary coding system according to a detected end-of-line
8560 format.
8561
8562 If optional argument HIGHEST is non-nil, return the coding system of
8563 highest priority.  */)
8564      (start, end, highest)
8565      Lisp_Object start, end, highest;
8566 {
8567   int from, to;
8568   int from_byte, to_byte;
8569
8570   CHECK_NUMBER_COERCE_MARKER (start);
8571   CHECK_NUMBER_COERCE_MARKER (end);
8572
8573   validate_region (&start, &end);
8574   from = XINT (start), to = XINT (end);
8575   from_byte = CHAR_TO_BYTE (from);
8576   to_byte = CHAR_TO_BYTE (to);
8577
8578   if (from < GPT && to >= GPT)
8579     move_gap_both (to, to_byte);
8580
8581   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8582                                to - from, to_byte - from_byte,
8583                                !NILP (highest),
8584                                !NILP (current_buffer
8585                                       ->enable_multibyte_characters),
8586                                Qnil);
8587 }
8588
8589 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8590        1, 2, 0,
8591        doc: /* Detect coding system of the text in STRING.
8592 Return a list of possible coding systems ordered by priority.
8593 The coding systems to try and their priorities follows what
8594 the function `coding-system-priority-list' (which see) returns.
8595
8596 If only ASCII characters are found (except for such ISO-2022 control
8597 characters as ESC), it returns a list of single element `undecided'
8598 or its subsidiary coding system according to a detected end-of-line
8599 format.
8600
8601 If optional argument HIGHEST is non-nil, return the coding system of
8602 highest priority.  */)
8603      (string, highest)
8604      Lisp_Object string, highest;
8605 {
8606   CHECK_STRING (string);
8607
8608   return detect_coding_system (SDATA (string),
8609                                SCHARS (string), SBYTES (string),
8610                                !NILP (highest), STRING_MULTIBYTE (string),
8611                                Qnil);
8612 }
8613
8614
8615 static INLINE int
8616 char_encodable_p (c, attrs)
8617      int c;
8618      Lisp_Object attrs;
8619 {
8620   Lisp_Object tail;
8621   struct charset *charset;
8622   Lisp_Object translation_table;
8623
8624   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8625   if (! NILP (translation_table))
8626     c = translate_char (translation_table, c);
8627   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8628        CONSP (tail); tail = XCDR (tail))
8629     {
8630       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8631       if (CHAR_CHARSET_P (c, charset))
8632         break;
8633     }
8634   return (! NILP (tail));
8635 }
8636
8637
/* Return a list of coding systems that safely encode the text between
   START and END.  If EXCLUDE is non-nil, it is a list of coding
   systems not to check.  The returned list doesn't contain any such
   coding systems.  In any case, if the text contains only ASCII or is
   unibyte, return t.

   START may also be a string; in that case the whole string is
   checked and END is not used.  The returned list always ends with
   `raw-text' and `no-conversion' (Qraw_text and Qno_conversion).  */

DEFUN ("find-coding-systems-region-internal",
       Ffind_coding_systems_region_internal,
       Sfind_coding_systems_region_internal, 2, 3, 0,
       doc: /* Internal use only.  */)
     (start, end, exclude)
     Lisp_Object start, end, exclude;
{
  Lisp_Object coding_attrs_list, safe_codings;
  EMACS_INT start_byte, end_byte;
  const unsigned char *p, *pbeg, *pend;
  int c;
  Lisp_Object tail, elt, work_table;

  if (STRINGP (start))
    {
      /* A unibyte string, or a multibyte string with as many chars as
         bytes, needs no further checking.  */
      if (!STRING_MULTIBYTE (start)
          || SCHARS (start) == SBYTES (start))
        return Qt;
      start_byte = 0;
      end_byte = SBYTES (start);
    }
  else
    {
      CHECK_NUMBER_COERCE_MARKER (start);
      CHECK_NUMBER_COERCE_MARKER (end);
      if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
        args_out_of_range (start, end);
      if (NILP (current_buffer->enable_multibyte_characters))
        return Qt;
      start_byte = CHAR_TO_BYTE (XINT (start));
      end_byte = CHAR_TO_BYTE (XINT (end));
      /* As many chars as bytes: nothing beyond one-byte characters.  */
      if (XINT (end) - XINT (start) == end_byte - start_byte)
        return Qt;

      /* If the gap is strictly inside the region, move it to the
         nearer end so that the region can be scanned as one
         contiguous run of bytes.  */
      if (XINT (start) < GPT && XINT (end) > GPT)
        {
          if ((GPT - XINT (start)) < (XINT (end) - GPT))
            move_gap_both (XINT (start), start_byte);
          else
            move_gap_both (XINT (end), end_byte);
        }
    }

  /* Collect the attribute vectors of all candidates: every entry of
     Vcoding_system_list that is not in EXCLUDE, is its own base name,
     and is not of type Qundecided.  The translation table obtained
     from get_translation_table is stored into each attribute vector
     for use by char_encodable_p.  */
  coding_attrs_list = Qnil;
  for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
    if (NILP (exclude)
        || NILP (Fmemq (XCAR (tail), exclude)))
      {
        Lisp_Object attrs;

        attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
        if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
            && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
          {
            ASET (attrs, coding_attr_trans_tbl,
                  get_translation_table (attrs, 1, NULL));
            coding_attrs_list = Fcons (attrs, coding_attrs_list);
          }
      }

  if (STRINGP (start))
    p = pbeg = SDATA (start);
  else
    p = pbeg = BYTE_POS_ADDR (start_byte);
  pend = p + (end_byte - start_byte);

  /* Trim the ASCII bytes at both ends of the text.  */
  while (p < pend && ASCII_BYTE_P (*p)) p++;
  while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;

  /* WORK_TABLE records the characters already examined, so that each
     distinct character is tested against the candidates only once.  */
  work_table = Fmake_char_table (Qnil, Qnil);
  while (p < pend)
    {
      if (ASCII_BYTE_P (*p))
        p++;
      else
        {
          c = STRING_CHAR_ADVANCE (p);
          if (!NILP (char_table_ref (work_table, c)))
            /* This character was already checked.  Ignore it.  */
            continue;

          charset_map_loaded = 0;
          /* Drop every candidate that cannot encode C.  The list is
             edited in place, and TAIL advances only when the current
             cell is kept or was the last one.  */
          for (tail = coding_attrs_list; CONSP (tail);)
            {
              elt = XCAR (tail);
              if (NILP (elt))
                tail = XCDR (tail);
              else if (char_encodable_p (c, elt))
                tail = XCDR (tail);
              else if (CONSP (XCDR (tail)))
                {
                  /* Delete this cell by copying the next cell over
                     it; TAIL stays put to examine the copied entry.  */
                  XSETCAR (tail, XCAR (XCDR (tail)));
                  XSETCDR (tail, XCDR (XCDR (tail)));
                }
              else
                {
                  /* Last cell: it cannot be unlinked this way, so
                     blank it out with nil instead.  */
                  XSETCAR (tail, Qnil);
                  tail = XCDR (tail);
                }
            }
          if (charset_map_loaded)
            {
              /* NOTE(review): presumably loading a charset map can
                 relocate the string or buffer text -- confirm.  The
                 scan pointers are recomputed from their offsets.  */
              EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;

              if (STRINGP (start))
                pbeg = SDATA (start);
              else
                pbeg = BYTE_POS_ADDR (start_byte);
              p = pbeg + p_offset;
              pend = pbeg + pend_offset;
            }
          char_table_set (work_table, c, Qt);
        }
    }

  /* Qraw_text and Qno_conversion are always part of the result; the
     base names of the surviving candidates are pushed in front.  */
  safe_codings = list2 (Qraw_text, Qno_conversion);
  for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
    if (! NILP (XCAR (tail)))
      safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);

  return safe_codings;
}
8766
8767
8768 DEFUN ("unencodable-char-position", Funencodable_char_position,
8769        Sunencodable_char_position, 3, 5, 0,
8770        doc: /*
8771 Return position of first un-encodable character in a region.
8772 START and END specify the region and CODING-SYSTEM specifies the
8773 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8774
8775 If optional 4th argument COUNT is non-nil, it specifies at most how
8776 many un-encodable characters to search.  In this case, the value is a
8777 list of positions.
8778
8779 If optional 5th argument STRING is non-nil, it is a string to search
8780 for un-encodable characters.  In that case, START and END are indexes
8781 to the string.  */)
8782      (start, end, coding_system, count, string)
8783      Lisp_Object start, end, coding_system, count, string;
8784 {
8785   int n;
8786   struct coding_system coding;
8787   Lisp_Object attrs, charset_list, translation_table;
8788   Lisp_Object positions;
8789   int from, to;
8790   const unsigned char *p, *stop, *pend;
8791   int ascii_compatible;
8792
8793   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8794   attrs = CODING_ID_ATTRS (coding.id);
8795   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8796     return Qnil;
8797   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8798   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8799   translation_table = get_translation_table (attrs, 1, NULL);
8800
8801   if (NILP (string))
8802     {
8803       validate_region (&start, &end);
8804       from = XINT (start);
8805       to = XINT (end);
8806       if (NILP (current_buffer->enable_multibyte_characters)
8807           || (ascii_compatible
8808               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8809         return Qnil;
8810       p = CHAR_POS_ADDR (from);
8811       pend = CHAR_POS_ADDR (to);
8812       if (from < GPT && to >= GPT)
8813         stop = GPT_ADDR;
8814       else
8815         stop = pend;
8816     }
8817   else
8818     {
8819       CHECK_STRING (string);
8820       CHECK_NATNUM (start);
8821       CHECK_NATNUM (end);
8822       from = XINT (start);
8823       to = XINT (end);
8824       if (from > to
8825           || to > SCHARS (string))
8826         args_out_of_range_3 (string, start, end);
8827       if (! STRING_MULTIBYTE (string))
8828         return Qnil;
8829       p = SDATA (string) + string_char_to_byte (string, from);
8830       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8831       if (ascii_compatible && (to - from) == (pend - p))
8832         return Qnil;
8833     }
8834
8835   if (NILP (count))
8836     n = 1;
8837   else
8838     {
8839       CHECK_NATNUM (count);
8840       n = XINT (count);
8841     }
8842
8843   positions = Qnil;
8844   while (1)
8845     {
8846       int c;
8847
8848       if (ascii_compatible)
8849         while (p < stop && ASCII_BYTE_P (*p))
8850           p++, from++;
8851       if (p >= stop)
8852         {
8853           if (p >= pend)
8854             break;
8855           stop = pend;
8856           p = GAP_END_ADDR;
8857         }
8858
8859       c = STRING_CHAR_ADVANCE (p);
8860       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8861           && ! char_charset (translate_char (translation_table, c),
8862                              charset_list, NULL))
8863         {
8864           positions = Fcons (make_number (from), positions);
8865           n--;
8866           if (n == 0)
8867             break;
8868         }
8869
8870       from++;
8871     }
8872
8873   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8874 }
8875
8876
8877 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8878        Scheck_coding_systems_region, 3, 3, 0,
8879        doc: /* Check if the region is encodable by coding systems.
8880
8881 START and END are buffer positions specifying the region.
8882 CODING-SYSTEM-LIST is a list of coding systems to check.
8883
8884 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8885 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8886 whole region, POS0, POS1, ... are buffer positions where non-encodable
8887 characters are found.
8888
8889 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8890 value is nil.
8891
8892 START may be a string.  In that case, check if the string is
8893 encodable, and the value contains indices to the string instead of
8894 buffer positions.  END is ignored.
8895
8896 If the current buffer (or START if it is a string) is unibyte, the value
8897 is nil.  */)
8898      (start, end, coding_system_list)
8899      Lisp_Object start, end, coding_system_list;
8900 {
8901   Lisp_Object list;
8902   EMACS_INT start_byte, end_byte;
8903   int pos;
8904   const unsigned char *p, *pbeg, *pend;
8905   int c;
8906   Lisp_Object tail, elt, attrs;
8907
8908   if (STRINGP (start))
8909     {
8910       if (!STRING_MULTIBYTE (start)
8911           || SCHARS (start) == SBYTES (start))
8912         return Qnil;
8913       start_byte = 0;
8914       end_byte = SBYTES (start);
8915       pos = 0;
8916     }
8917   else
8918     {
8919       CHECK_NUMBER_COERCE_MARKER (start);
8920       CHECK_NUMBER_COERCE_MARKER (end);
8921       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8922         args_out_of_range (start, end);
8923       if (NILP (current_buffer->enable_multibyte_characters))
8924         return Qnil;
8925       start_byte = CHAR_TO_BYTE (XINT (start));
8926       end_byte = CHAR_TO_BYTE (XINT (end));
8927       if (XINT (end) - XINT (start) == end_byte - start_byte)
8928         return Qnil;
8929
8930       if (XINT (start) < GPT && XINT (end) > GPT)
8931         {
8932           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8933             move_gap_both (XINT (start), start_byte);
8934           else
8935             move_gap_both (XINT (end), end_byte);
8936         }
8937       pos = XINT (start);
8938     }
8939
8940   list = Qnil;
8941   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8942     {
8943       elt = XCAR (tail);
8944       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8945       ASET (attrs, coding_attr_trans_tbl,
8946             get_translation_table (attrs, 1, NULL));
8947       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8948     }
8949
8950   if (STRINGP (start))
8951     p = pbeg = SDATA (start);
8952   else
8953     p = pbeg = BYTE_POS_ADDR (start_byte);
8954   pend = p + (end_byte - start_byte);
8955
8956   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8957   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8958
8959   while (p < pend)
8960     {
8961       if (ASCII_BYTE_P (*p))
8962         p++;
8963       else
8964         {
8965           c = STRING_CHAR_ADVANCE (p);
8966
8967           charset_map_loaded = 0;
8968           for (tail = list; CONSP (tail); tail = XCDR (tail))
8969             {
8970               elt = XCDR (XCAR (tail));
8971               if (! char_encodable_p (c, XCAR (elt)))
8972                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8973             }
8974           if (charset_map_loaded)
8975             {
8976               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8977
8978               if (STRINGP (start))
8979                 pbeg = SDATA (start);
8980               else
8981                 pbeg = BYTE_POS_ADDR (start_byte);
8982               p = pbeg + p_offset;
8983               pend = pbeg + pend_offset;
8984             }
8985         }
8986       pos++;
8987     }
8988
8989   tail = list;
8990   list = Qnil;
8991   for (; CONSP (tail); tail = XCDR (tail))
8992     {
8993       elt = XCAR (tail);
8994       if (CONSP (XCDR (XCDR (elt))))
8995         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8996                       list);
8997     }
8998
8999   return list;
9000 }
9001
9002
9003 Lisp_Object
9004 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
9005      Lisp_Object start, end, coding_system, dst_object;
9006      int encodep, norecord;
9007 {
9008   struct coding_system coding;
9009   EMACS_INT from, from_byte, to, to_byte;
9010   Lisp_Object src_object;
9011
9012   CHECK_NUMBER_COERCE_MARKER (start);
9013   CHECK_NUMBER_COERCE_MARKER (end);
9014   if (NILP (coding_system))
9015     coding_system = Qno_conversion;
9016   else
9017     CHECK_CODING_SYSTEM (coding_system);
9018   src_object = Fcurrent_buffer ();
9019   if (NILP (dst_object))
9020     dst_object = src_object;
9021   else if (! EQ (dst_object, Qt))
9022     CHECK_BUFFER (dst_object);
9023
9024   validate_region (&start, &end);
9025   from = XFASTINT (start);
9026   from_byte = CHAR_TO_BYTE (from);
9027   to = XFASTINT (end);
9028   to_byte = CHAR_TO_BYTE (to);
9029
9030   setup_coding_system (coding_system, &coding);
9031   coding.mode |= CODING_MODE_LAST_BLOCK;
9032
9033   if (encodep)
9034     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9035                           dst_object);
9036   else
9037     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9038                           dst_object);
9039   if (! norecord)
9040     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9041
9042   return (BUFFERP (dst_object)
9043           ? make_number (coding.produced_char)
9044           : coding.dst_object);
9045 }
9046
9047
9048 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9049        3, 4, "r\nzCoding system: ",
9050        doc: /* Decode the current region from the specified coding system.
9051 When called from a program, takes four arguments:
9052         START, END, CODING-SYSTEM, and DESTINATION.
9053 START and END are buffer positions.
9054
9055 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9056 If nil, the region between START and END is replaced by the decoded text.
9057 If buffer, the decoded text is inserted in that buffer after point (point
9058 does not move).
9059 In those cases, the length of the decoded text is returned.
9060 If DESTINATION is t, the decoded text is returned.
9061
9062 This function sets `last-coding-system-used' to the precise coding system
9063 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9064 not fully specified.)  */)
9065      (start, end, coding_system, destination)
9066      Lisp_Object start, end, coding_system, destination;
9067 {
9068   return code_convert_region (start, end, coding_system, destination, 0, 0);
9069 }
9070
9071 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9072        3, 4, "r\nzCoding system: ",
9073        doc: /* Encode the current region by specified coding system.
9074 When called from a program, takes four arguments:
9075         START, END, CODING-SYSTEM and DESTINATION.
9076 START and END are buffer positions.
9077
9078 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9079 If nil, the region between START and END is replace by the encoded text.
9080 If buffer, the encoded text is inserted in that buffer after point (point
9081 does not move).
9082 In those cases, the length of the encoded text is returned.
9083 If DESTINATION is t, the encoded text is returned.
9084
9085 This function sets `last-coding-system-used' to the precise coding system
9086 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9087 not fully specified.)  */)
9088   (start, end, coding_system, destination)
9089      Lisp_Object start, end, coding_system, destination;
9090 {
9091   return code_convert_region (start, end, coding_system, destination, 1, 0);
9092 }
9093
9094 Lisp_Object
9095 code_convert_string (string, coding_system, dst_object,
9096                      encodep, nocopy, norecord)
9097      Lisp_Object string, coding_system, dst_object;
9098      int encodep, nocopy, norecord;
9099 {
9100   struct coding_system coding;
9101   EMACS_INT chars, bytes;
9102
9103   CHECK_STRING (string);
9104   if (NILP (coding_system))
9105     {
9106       if (! norecord)
9107         Vlast_coding_system_used = Qno_conversion;
9108       if (NILP (dst_object))
9109         return (nocopy ? Fcopy_sequence (string) : string);
9110     }
9111
9112   if (NILP (coding_system))
9113     coding_system = Qno_conversion;
9114   else
9115     CHECK_CODING_SYSTEM (coding_system);
9116   if (NILP (dst_object))
9117     dst_object = Qt;
9118   else if (! EQ (dst_object, Qt))
9119     CHECK_BUFFER (dst_object);
9120
9121   setup_coding_system (coding_system, &coding);
9122   coding.mode |= CODING_MODE_LAST_BLOCK;
9123   chars = SCHARS (string);
9124   bytes = SBYTES (string);
9125   if (encodep)
9126     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9127   else
9128     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9129   if (! norecord)
9130     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9131
9132   return (BUFFERP (dst_object)
9133           ? make_number (coding.produced_char)
9134           : coding.dst_object);
9135 }
9136
9137
9138 /* Encode or decode STRING according to CODING_SYSTEM.
9139    Do not set Vlast_coding_system_used.
9140
9141    This function is called only from macros DECODE_FILE and
9142    ENCODE_FILE, thus we ignore character composition.  */
9143
9144 Lisp_Object
9145 code_convert_string_norecord (string, coding_system, encodep)
9146      Lisp_Object string, coding_system;
9147      int encodep;
9148 {
9149   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9150 }
9151
9152
9153 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9154        2, 4, 0,
9155        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9156
9157 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9158 if the decoding operation is trivial.
9159
9160 Optional fourth arg BUFFER non-nil means that the decoded text is
9161 inserted in that buffer after point (point does not move).  In this
9162 case, the return value is the length of the decoded text.
9163
9164 This function sets `last-coding-system-used' to the precise coding system
9165 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9166 not fully specified.)  */)
9167   (string, coding_system, nocopy, buffer)
9168      Lisp_Object string, coding_system, nocopy, buffer;
9169 {
9170   return code_convert_string (string, coding_system, buffer,
9171                               0, ! NILP (nocopy), 0);
9172 }
9173
9174 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9175        2, 4, 0,
9176        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9177
9178 Optional third arg NOCOPY non-nil means it is OK to return STRING
9179 itself if the encoding operation is trivial.
9180
9181 Optional fourth arg BUFFER non-nil means that the encoded text is
9182 inserted in that buffer after point (point does not move).  In this
9183 case, the return value is the length of the encoded text.
9184
9185 This function sets `last-coding-system-used' to the precise coding system
9186 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9187 not fully specified.)  */)
9188      (string, coding_system, nocopy, buffer)
9189      Lisp_Object string, coding_system, nocopy, buffer;
9190 {
9191   return code_convert_string (string, coding_system, buffer,
9192                               1, ! NILP (nocopy), 1);
9193 }
9194
9195 \f
9196 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9197        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9198 Return the corresponding character.  */)
9199      (code)
9200      Lisp_Object code;
9201 {
9202   Lisp_Object spec, attrs, val;
9203   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9204   int c;
9205
9206   CHECK_NATNUM (code);
9207   c = XFASTINT (code);
9208   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9209   attrs = AREF (spec, 0);
9210
9211   if (ASCII_BYTE_P (c)
9212       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9213     return code;
9214
9215   val = CODING_ATTR_CHARSET_LIST (attrs);
9216   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9217   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9218   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9219
9220   if (c <= 0x7F)
9221     charset = charset_roman;
9222   else if (c >= 0xA0 && c < 0xDF)
9223     {
9224       charset = charset_kana;
9225       c -= 0x80;
9226     }
9227   else
9228     {
9229       int s1 = c >> 8, s2 = c & 0xFF;
9230
9231       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9232           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9233         error ("Invalid code: %d", code);
9234       SJIS_TO_JIS (c);
9235       charset = charset_kanji;
9236     }
9237   c = DECODE_CHAR (charset, c);
9238   if (c < 0)
9239     error ("Invalid code: %d", code);
9240   return make_number (c);
9241 }
9242
9243
9244 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9245        doc: /* Encode a Japanese character CH to shift_jis encoding.
9246 Return the corresponding code in SJIS.  */)
9247      (ch)
9248     Lisp_Object ch;
9249 {
9250   Lisp_Object spec, attrs, charset_list;
9251   int c;
9252   struct charset *charset;
9253   unsigned code;
9254
9255   CHECK_CHARACTER (ch);
9256   c = XFASTINT (ch);
9257   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9258   attrs = AREF (spec, 0);
9259
9260   if (ASCII_CHAR_P (c)
9261       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9262     return ch;
9263
9264   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9265   charset = char_charset (c, charset_list, &code);
9266   if (code == CHARSET_INVALID_CODE (charset))
9267     error ("Can't encode by shift_jis encoding: %d", c);
9268   JIS_TO_SJIS (code);
9269
9270   return make_number (code);
9271 }
9272
9273 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9274        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9275 Return the corresponding character.  */)
9276      (code)
9277      Lisp_Object code;
9278 {
9279   Lisp_Object spec, attrs, val;
9280   struct charset *charset_roman, *charset_big5, *charset;
9281   int c;
9282
9283   CHECK_NATNUM (code);
9284   c = XFASTINT (code);
9285   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9286   attrs = AREF (spec, 0);
9287
9288   if (ASCII_BYTE_P (c)
9289       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9290     return code;
9291
9292   val = CODING_ATTR_CHARSET_LIST (attrs);
9293   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9294   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9295
9296   if (c <= 0x7F)
9297     charset = charset_roman;
9298   else
9299     {
9300       int b1 = c >> 8, b2 = c & 0x7F;
9301       if (b1 < 0xA1 || b1 > 0xFE
9302           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9303         error ("Invalid code: %d", code);
9304       charset = charset_big5;
9305     }
9306   c = DECODE_CHAR (charset, (unsigned )c);
9307   if (c < 0)
9308     error ("Invalid code: %d", code);
9309   return make_number (c);
9310 }
9311
9312 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9313        doc: /* Encode the Big5 character CH to BIG5 coding system.
9314 Return the corresponding character code in Big5.  */)
9315      (ch)
9316      Lisp_Object ch;
9317 {
9318   Lisp_Object spec, attrs, charset_list;
9319   struct charset *charset;
9320   int c;
9321   unsigned code;
9322
9323   CHECK_CHARACTER (ch);
9324   c = XFASTINT (ch);
9325   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9326   attrs = AREF (spec, 0);
9327   if (ASCII_CHAR_P (c)
9328       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9329     return ch;
9330
9331   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9332   charset = char_charset (c, charset_list, &code);
9333   if (code == CHARSET_INVALID_CODE (charset))
9334     error ("Can't encode by Big5 encoding: %d", c);
9335
9336   return make_number (code);
9337 }
9338
9339 \f
9340 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9341        Sset_terminal_coding_system_internal, 1, 2, 0,
9342        doc: /* Internal use only.  */)
9343      (coding_system, terminal)
9344      Lisp_Object coding_system;
9345      Lisp_Object terminal;
9346 {
9347   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9348   CHECK_SYMBOL (coding_system);
9349   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9350   /* We had better not send unsafe characters to terminal.  */
9351   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9352   /* Characer composition should be disabled.  */
9353   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9354   terminal_coding->src_multibyte = 1;
9355   terminal_coding->dst_multibyte = 0;
9356   return Qnil;
9357 }
9358
9359 DEFUN ("set-safe-terminal-coding-system-internal",
9360        Fset_safe_terminal_coding_system_internal,
9361        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9362        doc: /* Internal use only.  */)
9363      (coding_system)
9364      Lisp_Object coding_system;
9365 {
9366   CHECK_SYMBOL (coding_system);
9367   setup_coding_system (Fcheck_coding_system (coding_system),
9368                        &safe_terminal_coding);
9369   /* Characer composition should be disabled.  */
9370   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9371   safe_terminal_coding.src_multibyte = 1;
9372   safe_terminal_coding.dst_multibyte = 0;
9373   return Qnil;
9374 }
9375
9376 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9377        Sterminal_coding_system, 0, 1, 0,
9378        doc: /* Return coding system specified for terminal output on the given terminal.
9379 TERMINAL may be a terminal object, a frame, or nil for the selected
9380 frame's terminal device.  */)
9381      (terminal)
9382      Lisp_Object terminal;
9383 {
9384   struct coding_system *terminal_coding
9385     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9386   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9387
9388   /* For backward compatibility, return nil if it is `undecided'. */
9389   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9390 }
9391
9392 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9393        Sset_keyboard_coding_system_internal, 1, 2, 0,
9394        doc: /* Internal use only.  */)
9395      (coding_system, terminal)
9396      Lisp_Object coding_system;
9397      Lisp_Object terminal;
9398 {
9399   struct terminal *t = get_terminal (terminal, 1);
9400   CHECK_SYMBOL (coding_system);
9401   if (NILP (coding_system))
9402     coding_system = Qno_conversion;
9403   else
9404     Fcheck_coding_system (coding_system);
9405   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9406   /* Characer composition should be disabled.  */
9407   TERMINAL_KEYBOARD_CODING (t)->common_flags
9408     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9409   return Qnil;
9410 }
9411
9412 DEFUN ("keyboard-coding-system",
9413        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9414        doc: /* Return coding system specified for decoding keyboard input.  */)
9415      (terminal)
9416      Lisp_Object terminal;
9417 {
9418   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9419                          (get_terminal (terminal, 1))->id);
9420 }
9421
9422 \f
9423 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9424        Sfind_operation_coding_system,  1, MANY, 0,
9425        doc: /* Choose a coding system for an operation based on the target name.
9426 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9427 DECODING-SYSTEM is the coding system to use for decoding
9428 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9429 for encoding (in case OPERATION does encoding).
9430
9431 The first argument OPERATION specifies an I/O primitive:
9432   For file I/O, `insert-file-contents' or `write-region'.
9433   For process I/O, `call-process', `call-process-region', or `start-process'.
9434   For network I/O, `open-network-stream'.
9435
9436 The remaining arguments should be the same arguments that were passed
9437 to the primitive.  Depending on which primitive, one of those arguments
9438 is selected as the TARGET.  For example, if OPERATION does file I/O,
9439 whichever argument specifies the file name is TARGET.
9440
9441 TARGET has a meaning which depends on OPERATION:
9442   For file I/O, TARGET is a file name (except for the special case below).
9443   For process I/O, TARGET is a process name.
9444   For network I/O, TARGET is a service name or a port number.
9445
9446 This function looks up what is specified for TARGET in
9447 `file-coding-system-alist', `process-coding-system-alist',
9448 or `network-coding-system-alist' depending on OPERATION.
9449 They may specify a coding system, a cons of coding systems,
9450 or a function symbol to call.
9451 In the last case, we call the function with one argument,
9452 which is a list of all the arguments given to this function.
9453 If the function can't decide a coding system, it can return
9454 `undecided' so that the normal code-detection is performed.
9455
9456 If OPERATION is `insert-file-contents', the argument corresponding to
9457 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9458 file name to look up, and BUFFER is a buffer that contains the file's
9459 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9460 function to call for FILENAME, that function should examine the
9461 contents of BUFFER instead of reading the file.
9462
9463 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9464      (nargs, args)
9465      int nargs;
9466      Lisp_Object *args;
9467 {
9468   Lisp_Object operation, target_idx, target, val;
9469   register Lisp_Object chain;
9470
9471   if (nargs < 2)
9472     error ("Too few arguments");
9473   operation = args[0];
9474   if (!SYMBOLP (operation)
9475       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9476     error ("Invalid first argument");
9477   if (nargs < 1 + XINT (target_idx))
9478     error ("Too few arguments for operation: %s",
9479            SDATA (SYMBOL_NAME (operation)));
9480   target = args[XINT (target_idx) + 1];
9481   if (!(STRINGP (target)
9482         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9483             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9484         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9485     error ("Invalid %dth argument", XINT (target_idx) + 1);
9486   if (CONSP (target))
9487     target = XCAR (target);
9488
9489   chain = ((EQ (operation, Qinsert_file_contents)
9490             || EQ (operation, Qwrite_region))
9491            ? Vfile_coding_system_alist
9492            : (EQ (operation, Qopen_network_stream)
9493               ? Vnetwork_coding_system_alist
9494               : Vprocess_coding_system_alist));
9495   if (NILP (chain))
9496     return Qnil;
9497
9498   for (; CONSP (chain); chain = XCDR (chain))
9499     {
9500       Lisp_Object elt;
9501
9502       elt = XCAR (chain);
9503       if (CONSP (elt)
9504           && ((STRINGP (target)
9505                && STRINGP (XCAR (elt))
9506                && fast_string_match (XCAR (elt), target) >= 0)
9507               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9508         {
9509           val = XCDR (elt);
9510           /* Here, if VAL is both a valid coding system and a valid
9511              function symbol, we return VAL as a coding system.  */
9512           if (CONSP (val))
9513             return val;
9514           if (! SYMBOLP (val))
9515             return Qnil;
9516           if (! NILP (Fcoding_system_p (val)))
9517             return Fcons (val, val);
9518           if (! NILP (Ffboundp (val)))
9519             {
9520               /* We use call1 rather than safe_call1
9521                  so as to get bug reports about functions called here
9522                  which don't handle the current interface.  */
9523               val = call1 (val, Flist (nargs, args));
9524               if (CONSP (val))
9525                 return val;
9526               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9527                 return Fcons (val, val);
9528             }
9529           return Qnil;
9530         }
9531     }
9532   return Qnil;
9533 }
9534
9535 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9536        Sset_coding_system_priority, 0, MANY, 0,
9537        doc: /* Assign higher priority to the coding systems given as arguments.
9538 If multiple coding systems belong to the same category,
9539 all but the first one are ignored.
9540
9541 usage: (set-coding-system-priority &rest coding-systems)  */)
9542      (nargs, args)
9543      int nargs;
9544      Lisp_Object *args;
9545 {
9546   int i, j;
9547   int changed[coding_category_max];
9548   enum coding_category priorities[coding_category_max];
9549
9550   bzero (changed, sizeof changed);
9551
9552   for (i = j = 0; i < nargs; i++)
9553     {
9554       enum coding_category category;
9555       Lisp_Object spec, attrs;
9556
9557       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9558       attrs = AREF (spec, 0);
9559       category = XINT (CODING_ATTR_CATEGORY (attrs));
9560       if (changed[category])
9561         /* Ignore this coding system because a coding system of the
9562            same category already had a higher priority.  */
9563         continue;
9564       changed[category] = 1;
9565       priorities[j++] = category;
9566       if (coding_categories[category].id >= 0
9567           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9568         setup_coding_system (args[i], &coding_categories[category]);
9569       Fset (AREF (Vcoding_category_table, category), args[i]);
9570     }
9571
9572   /* Now we have decided top J priorities.  Reflect the order of the
9573      original priorities to the remaining priorities.  */
9574
9575   for (i = j, j = 0; i < coding_category_max; i++, j++)
9576     {
9577       while (j < coding_category_max
9578              && changed[coding_priorities[j]])
9579         j++;
9580       if (j == coding_category_max)
9581         abort ();
9582       priorities[i] = coding_priorities[j];
9583     }
9584
9585   bcopy (priorities, coding_priorities, sizeof priorities);
9586
9587   /* Update `coding-category-list'.  */
9588   Vcoding_category_list = Qnil;
9589   for (i = coding_category_max - 1; i >= 0; i--)
9590     Vcoding_category_list
9591       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9592                Vcoding_category_list);
9593
9594   return Qnil;
9595 }
9596
9597 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9598        Scoding_system_priority_list, 0, 1, 0,
9599        doc: /* Return a list of coding systems ordered by their priorities.
9600 The list contains a subset of coding systems; i.e. coding systems
9601 assigned to each coding category (see `coding-category-list').
9602
9603 HIGHESTP non-nil means just return the highest priority one.  */)
9604      (highestp)
9605      Lisp_Object highestp;
9606 {
9607   int i;
9608   Lisp_Object val;
9609
9610   for (i = 0, val = Qnil; i < coding_category_max; i++)
9611     {
9612       enum coding_category category = coding_priorities[i];
9613       int id = coding_categories[category].id;
9614       Lisp_Object attrs;
9615
9616       if (id < 0)
9617         continue;
9618       attrs = CODING_ID_ATTRS (id);
9619       if (! NILP (highestp))
9620         return CODING_ATTR_BASE_NAME (attrs);
9621       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9622     }
9623   return Fnreverse (val);
9624 }
9625
9626 static const char *const suffixes[] = { "-unix", "-dos", "-mac" };
9627
9628 static Lisp_Object
9629 make_subsidiaries (base)
9630      Lisp_Object base;
9631 {
9632   Lisp_Object subsidiaries;
9633   int base_name_len = SBYTES (SYMBOL_NAME (base));
9634   char *buf = (char *) alloca (base_name_len + 6);
9635   int i;
9636
9637   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
9638   subsidiaries = Fmake_vector (make_number (3), Qnil);
9639   for (i = 0; i < 3; i++)
9640     {
9641       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9642       ASET (subsidiaries, i, intern (buf));
9643     }
9644   return subsidiaries;
9645 }
9646
9647
9648 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9649        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9650        doc: /* For internal use only.
9651 usage: (define-coding-system-internal ...)  */)
9652      (nargs, args)
9653      int nargs;
9654      Lisp_Object *args;
9655 {
9656   Lisp_Object name;
9657   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
9658   Lisp_Object attrs;            /* Vector of attributes.  */
9659   Lisp_Object eol_type;
9660   Lisp_Object aliases;
9661   Lisp_Object coding_type, charset_list, safe_charsets;
9662   enum coding_category category;
9663   Lisp_Object tail, val;
9664   int max_charset_id = 0;
9665   int i;
9666
9667   if (nargs < coding_arg_max)
9668     goto short_args;
9669
9670   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9671
9672   name = args[coding_arg_name];
9673   CHECK_SYMBOL (name);
9674   CODING_ATTR_BASE_NAME (attrs) = name;
9675
9676   val = args[coding_arg_mnemonic];
9677   if (! STRINGP (val))
9678     CHECK_CHARACTER (val);
9679   CODING_ATTR_MNEMONIC (attrs) = val;
9680
9681   coding_type = args[coding_arg_coding_type];
9682   CHECK_SYMBOL (coding_type);
9683   CODING_ATTR_TYPE (attrs) = coding_type;
9684
9685   charset_list = args[coding_arg_charset_list];
9686   if (SYMBOLP (charset_list))
9687     {
9688       if (EQ (charset_list, Qiso_2022))
9689         {
9690           if (! EQ (coding_type, Qiso_2022))
9691             error ("Invalid charset-list");
9692           charset_list = Viso_2022_charset_list;
9693         }
9694       else if (EQ (charset_list, Qemacs_mule))
9695         {
9696           if (! EQ (coding_type, Qemacs_mule))
9697             error ("Invalid charset-list");
9698           charset_list = Vemacs_mule_charset_list;
9699         }
9700       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9701         if (max_charset_id < XFASTINT (XCAR (tail)))
9702           max_charset_id = XFASTINT (XCAR (tail));
9703     }
9704   else
9705     {
9706       charset_list = Fcopy_sequence (charset_list);
9707       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9708         {
9709           struct charset *charset;
9710
9711           val = XCAR (tail);
9712           CHECK_CHARSET_GET_CHARSET (val, charset);
9713           if (EQ (coding_type, Qiso_2022)
9714               ? CHARSET_ISO_FINAL (charset) < 0
9715               : EQ (coding_type, Qemacs_mule)
9716               ? CHARSET_EMACS_MULE_ID (charset) < 0
9717               : 0)
9718             error ("Can't handle charset `%s'",
9719                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9720
9721           XSETCAR (tail, make_number (charset->id));
9722           if (max_charset_id < charset->id)
9723             max_charset_id = charset->id;
9724         }
9725     }
9726   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9727
9728   safe_charsets = make_uninit_string (max_charset_id + 1);
9729   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9730   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9731     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9732   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9733
9734   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9735
9736   val = args[coding_arg_decode_translation_table];
9737   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9738     CHECK_SYMBOL (val);
9739   CODING_ATTR_DECODE_TBL (attrs) = val;
9740
9741   val = args[coding_arg_encode_translation_table];
9742   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9743     CHECK_SYMBOL (val);
9744   CODING_ATTR_ENCODE_TBL (attrs) = val;
9745
9746   val = args[coding_arg_post_read_conversion];
9747   CHECK_SYMBOL (val);
9748   CODING_ATTR_POST_READ (attrs) = val;
9749
9750   val = args[coding_arg_pre_write_conversion];
9751   CHECK_SYMBOL (val);
9752   CODING_ATTR_PRE_WRITE (attrs) = val;
9753
9754   val = args[coding_arg_default_char];
9755   if (NILP (val))
9756     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9757   else
9758     {
9759       CHECK_CHARACTER (val);
9760       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9761     }
9762
9763   val = args[coding_arg_for_unibyte];
9764   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9765
9766   val = args[coding_arg_plist];
9767   CHECK_LIST (val);
9768   CODING_ATTR_PLIST (attrs) = val;
9769
9770   if (EQ (coding_type, Qcharset))
9771     {
9772       /* Generate a lisp vector of 256 elements.  Each element is nil,
9773          integer, or a list of charset IDs.
9774
9775          If Nth element is nil, the byte code N is invalid in this
9776          coding system.
9777
9778          If Nth element is a number NUM, N is the first byte of a
9779          charset whose ID is NUM.
9780
9781          If Nth element is a list of charset IDs, N is the first byte
9782          of one of them.  The list is sorted by dimensions of the
9783          charsets.  A charset of smaller dimension comes firtst. */
9784       val = Fmake_vector (make_number (256), Qnil);
9785
9786       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9787         {
9788           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9789           int dim = CHARSET_DIMENSION (charset);
9790           int idx = (dim - 1) * 4;
9791
9792           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9793             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9794
9795           for (i = charset->code_space[idx];
9796                i <= charset->code_space[idx + 1]; i++)
9797             {
9798               Lisp_Object tmp, tmp2;
9799               int dim2;
9800
9801               tmp = AREF (val, i);
9802               if (NILP (tmp))
9803                 tmp = XCAR (tail);
9804               else if (NUMBERP (tmp))
9805                 {
9806                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9807                   if (dim < dim2)
9808                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9809                   else
9810                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9811                 }
9812               else
9813                 {
9814                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9815                     {
9816                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9817                       if (dim < dim2)
9818                         break;
9819                     }
9820                   if (NILP (tmp2))
9821                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9822                   else
9823                     {
9824                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9825                       XSETCAR (tmp2, XCAR (tail));
9826                     }
9827                 }
9828               ASET (val, i, tmp);
9829             }
9830         }
9831       ASET (attrs, coding_attr_charset_valids, val);
9832       category = coding_category_charset;
9833     }
9834   else if (EQ (coding_type, Qccl))
9835     {
9836       Lisp_Object valids;
9837
9838       if (nargs < coding_arg_ccl_max)
9839         goto short_args;
9840
9841       val = args[coding_arg_ccl_decoder];
9842       CHECK_CCL_PROGRAM (val);
9843       if (VECTORP (val))
9844         val = Fcopy_sequence (val);
9845       ASET (attrs, coding_attr_ccl_decoder, val);
9846
9847       val = args[coding_arg_ccl_encoder];
9848       CHECK_CCL_PROGRAM (val);
9849       if (VECTORP (val))
9850         val = Fcopy_sequence (val);
9851       ASET (attrs, coding_attr_ccl_encoder, val);
9852
9853       val = args[coding_arg_ccl_valids];
9854       valids = Fmake_string (make_number (256), make_number (0));
9855       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9856         {
9857           int from, to;
9858
9859           val = Fcar (tail);
9860           if (INTEGERP (val))
9861             {
9862               from = to = XINT (val);
9863               if (from < 0 || from > 255)
9864                 args_out_of_range_3 (val, make_number (0), make_number (255));
9865             }
9866           else
9867             {
9868               CHECK_CONS (val);
9869               CHECK_NATNUM_CAR (val);
9870               CHECK_NATNUM_CDR (val);
9871               from = XINT (XCAR (val));
9872               if (from > 255)
9873                 args_out_of_range_3 (XCAR (val),
9874                                      make_number (0), make_number (255));
9875               to = XINT (XCDR (val));
9876               if (to < from || to > 255)
9877                 args_out_of_range_3 (XCDR (val),
9878                                      XCAR (val), make_number (255));
9879             }
9880           for (i = from; i <= to; i++)
9881             SSET (valids, i, 1);
9882         }
9883       ASET (attrs, coding_attr_ccl_valids, valids);
9884
9885       category = coding_category_ccl;
9886     }
9887   else if (EQ (coding_type, Qutf_16))
9888     {
9889       Lisp_Object bom, endian;
9890
9891       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9892
9893       if (nargs < coding_arg_utf16_max)
9894         goto short_args;
9895
9896       bom = args[coding_arg_utf16_bom];
9897       if (! NILP (bom) && ! EQ (bom, Qt))
9898         {
9899           CHECK_CONS (bom);
9900           val = XCAR (bom);
9901           CHECK_CODING_SYSTEM (val);
9902           val = XCDR (bom);
9903           CHECK_CODING_SYSTEM (val);
9904         }
9905       ASET (attrs, coding_attr_utf_bom, bom);
9906
9907       endian = args[coding_arg_utf16_endian];
9908       CHECK_SYMBOL (endian);
9909       if (NILP (endian))
9910         endian = Qbig;
9911       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9912         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9913       ASET (attrs, coding_attr_utf_16_endian, endian);
9914
9915       category = (CONSP (bom)
9916                   ? coding_category_utf_16_auto
9917                   : NILP (bom)
9918                   ? (EQ (endian, Qbig)
9919                      ? coding_category_utf_16_be_nosig
9920                      : coding_category_utf_16_le_nosig)
9921                   : (EQ (endian, Qbig)
9922                      ? coding_category_utf_16_be
9923                      : coding_category_utf_16_le));
9924     }
9925   else if (EQ (coding_type, Qiso_2022))
9926     {
9927       Lisp_Object initial, reg_usage, request, flags;
9928       int i;
9929
9930       if (nargs < coding_arg_iso2022_max)
9931         goto short_args;
9932
9933       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9934       CHECK_VECTOR (initial);
9935       for (i = 0; i < 4; i++)
9936         {
9937           val = Faref (initial, make_number (i));
9938           if (! NILP (val))
9939             {
9940               struct charset *charset;
9941
9942               CHECK_CHARSET_GET_CHARSET (val, charset);
9943               ASET (initial, i, make_number (CHARSET_ID (charset)));
9944               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9945                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9946             }
9947           else
9948             ASET (initial, i, make_number (-1));
9949         }
9950
9951       reg_usage = args[coding_arg_iso2022_reg_usage];
9952       CHECK_CONS (reg_usage);
9953       CHECK_NUMBER_CAR (reg_usage);
9954       CHECK_NUMBER_CDR (reg_usage);
9955
9956       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9957       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9958         {
9959           int id;
9960           Lisp_Object tmp;
9961
9962           val = Fcar (tail);
9963           CHECK_CONS (val);
9964           tmp = XCAR (val);
9965           CHECK_CHARSET_GET_ID (tmp, id);
9966           CHECK_NATNUM_CDR (val);
9967           if (XINT (XCDR (val)) >= 4)
9968             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9969           XSETCAR (val, make_number (id));
9970         }
9971
9972       flags = args[coding_arg_iso2022_flags];
9973       CHECK_NATNUM (flags);
9974       i = XINT (flags);
9975       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9976         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9977
9978       ASET (attrs, coding_attr_iso_initial, initial);
9979       ASET (attrs, coding_attr_iso_usage, reg_usage);
9980       ASET (attrs, coding_attr_iso_request, request);
9981       ASET (attrs, coding_attr_iso_flags, flags);
9982       setup_iso_safe_charsets (attrs);
9983
9984       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9985         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9986                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9987                     ? coding_category_iso_7_else
9988                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9989                     ? coding_category_iso_7
9990                     : coding_category_iso_7_tight);
9991       else
9992         {
9993           int id = XINT (AREF (initial, 1));
9994
9995           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9996                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9997                        || id < 0)
9998                       ? coding_category_iso_8_else
9999                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
10000                       ? coding_category_iso_8_1
10001                       : coding_category_iso_8_2);
10002         }
10003       if (category != coding_category_iso_8_1
10004           && category != coding_category_iso_8_2)
10005         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
10006     }
10007   else if (EQ (coding_type, Qemacs_mule))
10008     {
10009       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10010         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10011       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10012       category = coding_category_emacs_mule;
10013     }
10014   else if (EQ (coding_type, Qshift_jis))
10015     {
10016
10017       struct charset *charset;
10018
10019       if (XINT (Flength (charset_list)) != 3
10020           && XINT (Flength (charset_list)) != 4)
10021         error ("There should be three or four charsets");
10022
10023       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10024       if (CHARSET_DIMENSION (charset) != 1)
10025         error ("Dimension of charset %s is not one",
10026                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10027       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10028         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10029
10030       charset_list = XCDR (charset_list);
10031       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10032       if (CHARSET_DIMENSION (charset) != 1)
10033         error ("Dimension of charset %s is not one",
10034                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10035
10036       charset_list = XCDR (charset_list);
10037       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10038       if (CHARSET_DIMENSION (charset) != 2)
10039         error ("Dimension of charset %s is not two",
10040                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10041
10042       charset_list = XCDR (charset_list);
10043       if (! NILP (charset_list))
10044         {
10045           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10046           if (CHARSET_DIMENSION (charset) != 2)
10047             error ("Dimension of charset %s is not two",
10048                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10049         }
10050
10051       category = coding_category_sjis;
10052       Vsjis_coding_system = name;
10053     }
10054   else if (EQ (coding_type, Qbig5))
10055     {
10056       struct charset *charset;
10057
10058       if (XINT (Flength (charset_list)) != 2)
10059         error ("There should be just two charsets");
10060
10061       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10062       if (CHARSET_DIMENSION (charset) != 1)
10063         error ("Dimension of charset %s is not one",
10064                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10065       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10066         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10067
10068       charset_list = XCDR (charset_list);
10069       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10070       if (CHARSET_DIMENSION (charset) != 2)
10071         error ("Dimension of charset %s is not two",
10072                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10073
10074       category = coding_category_big5;
10075       Vbig5_coding_system = name;
10076     }
10077   else if (EQ (coding_type, Qraw_text))
10078     {
10079       category = coding_category_raw_text;
10080       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10081     }
10082   else if (EQ (coding_type, Qutf_8))
10083     {
10084       Lisp_Object bom;
10085
10086       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10087
10088       if (nargs < coding_arg_utf8_max)
10089         goto short_args;
10090
10091       bom = args[coding_arg_utf8_bom];
10092       if (! NILP (bom) && ! EQ (bom, Qt))
10093         {
10094           CHECK_CONS (bom);
10095           val = XCAR (bom);
10096           CHECK_CODING_SYSTEM (val);
10097           val = XCDR (bom);
10098           CHECK_CODING_SYSTEM (val);
10099         }
10100       ASET (attrs, coding_attr_utf_bom, bom);
10101
10102       category = (CONSP (bom) ? coding_category_utf_8_auto
10103                   : NILP (bom) ? coding_category_utf_8_nosig
10104                   : coding_category_utf_8_sig);
10105     }
10106   else if (EQ (coding_type, Qundecided))
10107     category = coding_category_undecided;
10108   else
10109     error ("Invalid coding system type: %s",
10110            SDATA (SYMBOL_NAME (coding_type)));
10111
10112   CODING_ATTR_CATEGORY (attrs) = make_number (category);
10113   CODING_ATTR_PLIST (attrs)
10114     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10115                                 CODING_ATTR_PLIST (attrs)));
10116   CODING_ATTR_PLIST (attrs)
10117     = Fcons (QCascii_compatible_p,
10118              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10119                     CODING_ATTR_PLIST (attrs)));
10120
10121   eol_type = args[coding_arg_eol_type];
10122   if (! NILP (eol_type)
10123       && ! EQ (eol_type, Qunix)
10124       && ! EQ (eol_type, Qdos)
10125       && ! EQ (eol_type, Qmac))
10126     error ("Invalid eol-type");
10127
10128   aliases = Fcons (name, Qnil);
10129
10130   if (NILP (eol_type))
10131     {
10132       eol_type = make_subsidiaries (name);
10133       for (i = 0; i < 3; i++)
10134         {
10135           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10136
10137           this_name = AREF (eol_type, i);
10138           this_aliases = Fcons (this_name, Qnil);
10139           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
10140           this_spec = Fmake_vector (make_number (3), attrs);
10141           ASET (this_spec, 1, this_aliases);
10142           ASET (this_spec, 2, this_eol_type);
10143           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10144           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10145           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10146           if (NILP (val))
10147             Vcoding_system_alist
10148               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10149                        Vcoding_system_alist);
10150         }
10151     }
10152
10153   spec_vec = Fmake_vector (make_number (3), attrs);
10154   ASET (spec_vec, 1, aliases);
10155   ASET (spec_vec, 2, eol_type);
10156
10157   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10158   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10159   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10160   if (NILP (val))
10161     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10162                                   Vcoding_system_alist);
10163
10164   {
10165     int id = coding_categories[category].id;
10166
10167     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10168       setup_coding_system (name, &coding_categories[category]);
10169   }
10170
10171   return Qnil;
10172
10173  short_args:
10174   return Fsignal (Qwrong_number_of_arguments,
10175                   Fcons (intern ("define-coding-system-internal"),
10176                          make_number (nargs)));
10177 }
10178
10179
10180 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10181        3, 3, 0,
10182        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10183   (coding_system, prop, val)
10184      Lisp_Object coding_system, prop, val;
10185 {
10186   Lisp_Object spec, attrs;
10187
10188   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10189   attrs = AREF (spec, 0);
10190   if (EQ (prop, QCmnemonic))
10191     {
10192       if (! STRINGP (val))
10193         CHECK_CHARACTER (val);
10194       CODING_ATTR_MNEMONIC (attrs) = val;
10195     }
10196   else if (EQ (prop, QCdefault_char))
10197     {
10198       if (NILP (val))
10199         val = make_number (' ');
10200       else
10201         CHECK_CHARACTER (val);
10202       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10203     }
10204   else if (EQ (prop, QCdecode_translation_table))
10205     {
10206       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10207         CHECK_SYMBOL (val);
10208       CODING_ATTR_DECODE_TBL (attrs) = val;
10209     }
10210   else if (EQ (prop, QCencode_translation_table))
10211     {
10212       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10213         CHECK_SYMBOL (val);
10214       CODING_ATTR_ENCODE_TBL (attrs) = val;
10215     }
10216   else if (EQ (prop, QCpost_read_conversion))
10217     {
10218       CHECK_SYMBOL (val);
10219       CODING_ATTR_POST_READ (attrs) = val;
10220     }
10221   else if (EQ (prop, QCpre_write_conversion))
10222     {
10223       CHECK_SYMBOL (val);
10224       CODING_ATTR_PRE_WRITE (attrs) = val;
10225     }
10226   else if (EQ (prop, QCascii_compatible_p))
10227     {
10228       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10229     }
10230
10231   CODING_ATTR_PLIST (attrs)
10232     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10233   return val;
10234 }
10235
10236
10237 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10238        Sdefine_coding_system_alias, 2, 2, 0,
10239        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10240      (alias, coding_system)
10241      Lisp_Object alias, coding_system;
10242 {
10243   Lisp_Object spec, aliases, eol_type, val;
10244
10245   CHECK_SYMBOL (alias);
10246   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10247   aliases = AREF (spec, 1);
10248   /* ALIASES should be a list of length more than zero, and the first
10249      element is a base coding system.  Append ALIAS at the tail of the
10250      list.  */
10251   while (!NILP (XCDR (aliases)))
10252     aliases = XCDR (aliases);
10253   XSETCDR (aliases, Fcons (alias, Qnil));
10254
10255   eol_type = AREF (spec, 2);
10256   if (VECTORP (eol_type))
10257     {
10258       Lisp_Object subsidiaries;
10259       int i;
10260
10261       subsidiaries = make_subsidiaries (alias);
10262       for (i = 0; i < 3; i++)
10263         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10264                                      AREF (eol_type, i));
10265     }
10266
10267   Fputhash (alias, spec, Vcoding_system_hash_table);
10268   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10269   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10270   if (NILP (val))
10271     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10272                                   Vcoding_system_alist);
10273
10274   return Qnil;
10275 }
10276
10277 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10278        1, 1, 0,
10279        doc: /* Return the base of CODING-SYSTEM.
10280 Any alias or subsidiary coding system is not a base coding system.  */)
10281   (coding_system)
10282      Lisp_Object coding_system;
10283 {
10284   Lisp_Object spec, attrs;
10285
10286   if (NILP (coding_system))
10287     return (Qno_conversion);
10288   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10289   attrs = AREF (spec, 0);
10290   return CODING_ATTR_BASE_NAME (attrs);
10291 }
10292
10293 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10294        1, 1, 0,
10295        doc: "Return the property list of CODING-SYSTEM.")
10296      (coding_system)
10297      Lisp_Object coding_system;
10298 {
10299   Lisp_Object spec, attrs;
10300
10301   if (NILP (coding_system))
10302     coding_system = Qno_conversion;
10303   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10304   attrs = AREF (spec, 0);
10305   return CODING_ATTR_PLIST (attrs);
10306 }
10307
10308
10309 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10310        1, 1, 0,
10311        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10312      (coding_system)
10313      Lisp_Object coding_system;
10314 {
10315   Lisp_Object spec;
10316
10317   if (NILP (coding_system))
10318     coding_system = Qno_conversion;
10319   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10320   return AREF (spec, 1);
10321 }
10322
10323 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10324        Scoding_system_eol_type, 1, 1, 0,
10325        doc: /* Return eol-type of CODING-SYSTEM.
10326 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10327
10328 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10329 and CR respectively.
10330
10331 A vector value indicates that a format of end-of-line should be
10332 detected automatically.  Nth element of the vector is the subsidiary
10333 coding system whose eol-type is N.  */)
10334      (coding_system)
10335      Lisp_Object coding_system;
10336 {
10337   Lisp_Object spec, eol_type;
10338   int n;
10339
10340   if (NILP (coding_system))
10341     coding_system = Qno_conversion;
10342   if (! CODING_SYSTEM_P (coding_system))
10343     return Qnil;
10344   spec = CODING_SYSTEM_SPEC (coding_system);
10345   eol_type = AREF (spec, 2);
10346   if (VECTORP (eol_type))
10347     return Fcopy_sequence (eol_type);
10348   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10349   return make_number (n);
10350 }
10351
10352 #endif /* emacs */
10353
10354 \f
/*** 12. Postamble ***/
10356
10357 void
10358 init_coding_once ()
10359 {
10360   int i;
10361
10362   for (i = 0; i < coding_category_max; i++)
10363     {
10364       coding_categories[i].id = -1;
10365       coding_priorities[i] = i;
10366     }
10367
10368   /* ISO2022 specific initialize routine.  */
10369   for (i = 0; i < 0x20; i++)
10370     iso_code_class[i] = ISO_control_0;
10371   for (i = 0x21; i < 0x7F; i++)
10372     iso_code_class[i] = ISO_graphic_plane_0;
10373   for (i = 0x80; i < 0xA0; i++)
10374     iso_code_class[i] = ISO_control_1;
10375   for (i = 0xA1; i < 0xFF; i++)
10376     iso_code_class[i] = ISO_graphic_plane_1;
10377   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10378   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10379   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10380   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10381   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10382   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10383   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10384   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10385   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10386
10387   for (i = 0; i < 256; i++)
10388     {
10389       emacs_mule_bytes[i] = 1;
10390     }
10391   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10392   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10393   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10394   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10395 }
10396
10397 #ifdef emacs
10398
10399 void
10400 syms_of_coding ()
10401 {
10402   staticpro (&Vcoding_system_hash_table);
10403   {
10404     Lisp_Object args[2];
10405     args[0] = QCtest;
10406     args[1] = Qeq;
10407     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10408   }
10409
10410   staticpro (&Vsjis_coding_system);
10411   Vsjis_coding_system = Qnil;
10412
10413   staticpro (&Vbig5_coding_system);
10414   Vbig5_coding_system = Qnil;
10415
10416   staticpro (&Vcode_conversion_reused_workbuf);
10417   Vcode_conversion_reused_workbuf = Qnil;
10418
10419   staticpro (&Vcode_conversion_workbuf_name);
10420   Vcode_conversion_workbuf_name = make_pure_c_string (" *code-conversion-work*");
10421
10422   reused_workbuf_in_use = 0;
10423
10424   DEFSYM (Qcharset, "charset");
10425   DEFSYM (Qtarget_idx, "target-idx");
10426   DEFSYM (Qcoding_system_history, "coding-system-history");
10427   Fset (Qcoding_system_history, Qnil);
10428
10429   /* Target FILENAME is the first argument.  */
10430   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10431   /* Target FILENAME is the third argument.  */
10432   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10433
10434   DEFSYM (Qcall_process, "call-process");
10435   /* Target PROGRAM is the first argument.  */
10436   Fput (Qcall_process, Qtarget_idx, make_number (0));
10437
10438   DEFSYM (Qcall_process_region, "call-process-region");
10439   /* Target PROGRAM is the third argument.  */
10440   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10441
10442   DEFSYM (Qstart_process, "start-process");
10443   /* Target PROGRAM is the third argument.  */
10444   Fput (Qstart_process, Qtarget_idx, make_number (2));
10445
10446   DEFSYM (Qopen_network_stream, "open-network-stream");
10447   /* Target SERVICE is the fourth argument.  */
10448   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10449
10450   DEFSYM (Qcoding_system, "coding-system");
10451   DEFSYM (Qcoding_aliases, "coding-aliases");
10452
10453   DEFSYM (Qeol_type, "eol-type");
10454   DEFSYM (Qunix, "unix");
10455   DEFSYM (Qdos, "dos");
10456
10457   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10458   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10459   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10460   DEFSYM (Qdefault_char, "default-char");
10461   DEFSYM (Qundecided, "undecided");
10462   DEFSYM (Qno_conversion, "no-conversion");
10463   DEFSYM (Qraw_text, "raw-text");
10464
10465   DEFSYM (Qiso_2022, "iso-2022");
10466
10467   DEFSYM (Qutf_8, "utf-8");
10468   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10469
10470   DEFSYM (Qutf_16, "utf-16");
10471   DEFSYM (Qbig, "big");
10472   DEFSYM (Qlittle, "little");
10473
10474   DEFSYM (Qshift_jis, "shift-jis");
10475   DEFSYM (Qbig5, "big5");
10476
10477   DEFSYM (Qcoding_system_p, "coding-system-p");
10478
10479   DEFSYM (Qcoding_system_error, "coding-system-error");
10480   Fput (Qcoding_system_error, Qerror_conditions,
10481         pure_cons (Qcoding_system_error, pure_cons (Qerror, Qnil)));
10482   Fput (Qcoding_system_error, Qerror_message,
10483         make_pure_c_string ("Invalid coding system"));
10484
10485   /* Intern this now in case it isn't already done.
10486      Setting this variable twice is harmless.
10487      But don't staticpro it here--that is done in alloc.c.  */
10488   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
10489
10490   DEFSYM (Qtranslation_table, "translation-table");
10491   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10492   DEFSYM (Qtranslation_table_id, "translation-table-id");
10493   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10494   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10495
10496   DEFSYM (Qvalid_codes, "valid-codes");
10497
10498   DEFSYM (Qemacs_mule, "emacs-mule");
10499
10500   DEFSYM (QCcategory, ":category");
10501   DEFSYM (QCmnemonic, ":mnemonic");
10502   DEFSYM (QCdefault_char, ":default-char");
10503   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10504   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10505   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10506   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10507   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10508
10509   Vcoding_category_table
10510     = Fmake_vector (make_number (coding_category_max), Qnil);
10511   staticpro (&Vcoding_category_table);
10512   /* Followings are target of code detection.  */
10513   ASET (Vcoding_category_table, coding_category_iso_7,
10514         intern_c_string ("coding-category-iso-7"));
10515   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10516         intern_c_string ("coding-category-iso-7-tight"));
10517   ASET (Vcoding_category_table, coding_category_iso_8_1,
10518         intern_c_string ("coding-category-iso-8-1"));
10519   ASET (Vcoding_category_table, coding_category_iso_8_2,
10520         intern_c_string ("coding-category-iso-8-2"));
10521   ASET (Vcoding_category_table, coding_category_iso_7_else,
10522         intern_c_string ("coding-category-iso-7-else"));
10523   ASET (Vcoding_category_table, coding_category_iso_8_else,
10524         intern_c_string ("coding-category-iso-8-else"));
10525   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10526         intern_c_string ("coding-category-utf-8-auto"));
10527   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10528         intern_c_string ("coding-category-utf-8"));
10529   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10530         intern_c_string ("coding-category-utf-8-sig"));
10531   ASET (Vcoding_category_table, coding_category_utf_16_be,
10532         intern_c_string ("coding-category-utf-16-be"));
10533   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10534         intern_c_string ("coding-category-utf-16-auto"));
10535   ASET (Vcoding_category_table, coding_category_utf_16_le,
10536         intern_c_string ("coding-category-utf-16-le"));
10537   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10538         intern_c_string ("coding-category-utf-16-be-nosig"));
10539   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10540         intern_c_string ("coding-category-utf-16-le-nosig"));
10541   ASET (Vcoding_category_table, coding_category_charset,
10542         intern_c_string ("coding-category-charset"));
10543   ASET (Vcoding_category_table, coding_category_sjis,
10544         intern_c_string ("coding-category-sjis"));
10545   ASET (Vcoding_category_table, coding_category_big5,
10546         intern_c_string ("coding-category-big5"));
10547   ASET (Vcoding_category_table, coding_category_ccl,
10548         intern_c_string ("coding-category-ccl"));
10549   ASET (Vcoding_category_table, coding_category_emacs_mule,
10550         intern_c_string ("coding-category-emacs-mule"));
  /* The following are NOT targets of code detection.  */
10552   ASET (Vcoding_category_table, coding_category_raw_text,
10553         intern_c_string ("coding-category-raw-text"));
10554   ASET (Vcoding_category_table, coding_category_undecided,
10555         intern_c_string ("coding-category-undecided"));
10556
10557   DEFSYM (Qinsufficient_source, "insufficient-source");
10558   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10559   DEFSYM (Qinvalid_source, "invalid-source");
10560   DEFSYM (Qinterrupted, "interrupted");
10561   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10562   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10563
10564   defsubr (&Scoding_system_p);
10565   defsubr (&Sread_coding_system);
10566   defsubr (&Sread_non_nil_coding_system);
10567   defsubr (&Scheck_coding_system);
10568   defsubr (&Sdetect_coding_region);
10569   defsubr (&Sdetect_coding_string);
10570   defsubr (&Sfind_coding_systems_region_internal);
10571   defsubr (&Sunencodable_char_position);
10572   defsubr (&Scheck_coding_systems_region);
10573   defsubr (&Sdecode_coding_region);
10574   defsubr (&Sencode_coding_region);
10575   defsubr (&Sdecode_coding_string);
10576   defsubr (&Sencode_coding_string);
10577   defsubr (&Sdecode_sjis_char);
10578   defsubr (&Sencode_sjis_char);
10579   defsubr (&Sdecode_big5_char);
10580   defsubr (&Sencode_big5_char);
10581   defsubr (&Sset_terminal_coding_system_internal);
10582   defsubr (&Sset_safe_terminal_coding_system_internal);
10583   defsubr (&Sterminal_coding_system);
10584   defsubr (&Sset_keyboard_coding_system_internal);
10585   defsubr (&Skeyboard_coding_system);
10586   defsubr (&Sfind_operation_coding_system);
10587   defsubr (&Sset_coding_system_priority);
10588   defsubr (&Sdefine_coding_system_internal);
10589   defsubr (&Sdefine_coding_system_alias);
10590   defsubr (&Scoding_system_put);
10591   defsubr (&Scoding_system_base);
10592   defsubr (&Scoding_system_plist);
10593   defsubr (&Scoding_system_aliases);
10594   defsubr (&Scoding_system_eol_type);
10595   defsubr (&Scoding_system_priority_list);
10596
10597   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10598                doc: /* List of coding systems.
10599
10600 Do not alter the value of this variable manually.  This variable should be
10601 updated by the functions `define-coding-system' and
10602 `define-coding-system-alias'.  */);
10603   Vcoding_system_list = Qnil;
10604
10605   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10606                doc: /* Alist of coding system names.
10607 Each element is one element list of coding system name.
10608 This variable is given to `completing-read' as COLLECTION argument.
10609
10610 Do not alter the value of this variable manually.  This variable should be
10611 updated by the functions `make-coding-system' and
10612 `define-coding-system-alias'.  */);
10613   Vcoding_system_alist = Qnil;
10614
10615   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10616                doc: /* List of coding-categories (symbols) ordered by priority.
10617
10618 On detecting a coding system, Emacs tries code detection algorithms
10619 associated with each coding-category one by one in this order.  When
10620 one algorithm agrees with a byte sequence of source text, the coding
10621 system bound to the corresponding coding-category is selected.
10622
10623 Don't modify this variable directly, but use `set-coding-priority'.  */);
10624   {
10625     int i;
10626
10627     Vcoding_category_list = Qnil;
10628     for (i = coding_category_max - 1; i >= 0; i--)
10629       Vcoding_category_list
10630         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10631                  Vcoding_category_list);
10632   }
10633
10634   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10635                doc: /* Specify the coding system for read operations.
10636 It is useful to bind this variable with `let', but do not set it globally.
10637 If the value is a coding system, it is used for decoding on read operation.
10638 If not, an appropriate element is used from one of the coding system alists.
10639 There are three such tables: `file-coding-system-alist',
10640 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10641   Vcoding_system_for_read = Qnil;
10642
10643   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10644                doc: /* Specify the coding system for write operations.
10645 Programs bind this variable with `let', but you should not set it globally.
10646 If the value is a coding system, it is used for encoding of output,
10647 when writing it to a file and when sending it to a file or subprocess.
10648
10649 If this does not specify a coding system, an appropriate element
10650 is used from one of the coding system alists.
10651 There are three such tables: `file-coding-system-alist',
10652 `process-coding-system-alist', and `network-coding-system-alist'.
10653 For output to files, if the above procedure does not specify a coding system,
10654 the value of `buffer-file-coding-system' is used.  */);
10655   Vcoding_system_for_write = Qnil;
10656
10657   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10658                doc: /*
10659 Coding system used in the latest file or process I/O.  */);
10660   Vlast_coding_system_used = Qnil;
10661
10662   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10663                doc: /*
10664 Error status of the last code conversion.
10665
10666 When an error was detected in the last code conversion, this variable
10667 is set to one of the following symbols.
10668   `insufficient-source'
10669   `inconsistent-eol'
10670   `invalid-source'
10671   `interrupted'
10672   `insufficient-memory'
10673 When no error was detected, the value doesn't change.  So, to check
10674 the error status of a code conversion by this variable, you must
10675 explicitly set this variable to nil before performing code
10676 conversion.  */);
10677   Vlast_code_conversion_error = Qnil;
10678
10679   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10680                doc: /*
10681 *Non-nil means always inhibit code conversion of end-of-line format.
10682 See info node `Coding Systems' and info node `Text and Binary' concerning
10683 such conversion.  */);
10684   inhibit_eol_conversion = 0;
10685
10686   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10687                doc: /*
10688 Non-nil means process buffer inherits coding system of process output.
10689 Bind it to t if the process output is to be treated as if it were a file
10690 read from some filesystem.  */);
10691   inherit_process_coding_system = 0;
10692
10693   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10694                doc: /*
10695 Alist to decide a coding system to use for a file I/O operation.
10696 The format is ((PATTERN . VAL) ...),
10697 where PATTERN is a regular expression matching a file name,
10698 VAL is a coding system, a cons of coding systems, or a function symbol.
10699 If VAL is a coding system, it is used for both decoding and encoding
10700 the file contents.
10701 If VAL is a cons of coding systems, the car part is used for decoding,
10702 and the cdr part is used for encoding.
10703 If VAL is a function symbol, the function must return a coding system
10704 or a cons of coding systems which are used as above.  The function is
10705 called with an argument that is a list of the arguments with which
10706 `find-operation-coding-system' was called.  If the function can't decide
10707 a coding system, it can return `undecided' so that the normal
10708 code-detection is performed.
10709
10710 See also the function `find-operation-coding-system'
10711 and the variable `auto-coding-alist'.  */);
10712   Vfile_coding_system_alist = Qnil;
10713
10714   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10715                doc: /*
10716 Alist to decide a coding system to use for a process I/O operation.
10717 The format is ((PATTERN . VAL) ...),
10718 where PATTERN is a regular expression matching a program name,
10719 VAL is a coding system, a cons of coding systems, or a function symbol.
10720 If VAL is a coding system, it is used for both decoding what received
10721 from the program and encoding what sent to the program.
10722 If VAL is a cons of coding systems, the car part is used for decoding,
10723 and the cdr part is used for encoding.
10724 If VAL is a function symbol, the function must return a coding system
10725 or a cons of coding systems which are used as above.
10726
10727 See also the function `find-operation-coding-system'.  */);
10728   Vprocess_coding_system_alist = Qnil;
10729
10730   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10731                doc: /*
10732 Alist to decide a coding system to use for a network I/O operation.
10733 The format is ((PATTERN . VAL) ...),
10734 where PATTERN is a regular expression matching a network service name
10735 or is a port number to connect to,
10736 VAL is a coding system, a cons of coding systems, or a function symbol.
10737 If VAL is a coding system, it is used for both decoding what received
10738 from the network stream and encoding what sent to the network stream.
10739 If VAL is a cons of coding systems, the car part is used for decoding,
10740 and the cdr part is used for encoding.
10741 If VAL is a function symbol, the function must return a coding system
10742 or a cons of coding systems which are used as above.
10743
10744 See also the function `find-operation-coding-system'.  */);
10745   Vnetwork_coding_system_alist = Qnil;
10746
10747   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10748                doc: /* Coding system to use with system messages.
10749 Also used for decoding keyboard input on X Window system.  */);
10750   Vlocale_coding_system = Qnil;
10751
10752   /* The eol mnemonics are reset in startup.el system-dependently.  */
10753   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10754                doc: /*
10755 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10756   eol_mnemonic_unix = make_pure_c_string (":");
10757
10758   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10759                doc: /*
10760 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10761   eol_mnemonic_dos = make_pure_c_string ("\\");
10762
10763   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10764                doc: /*
10765 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10766   eol_mnemonic_mac = make_pure_c_string ("/");
10767
10768   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10769                doc: /*
10770 *String displayed in mode line when end-of-line format is not yet determined.  */);
10771   eol_mnemonic_undecided = make_pure_c_string (":");
10772
10773   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10774                doc: /*
10775 *Non-nil enables character translation while encoding and decoding.  */);
10776   Venable_character_translation = Qt;
10777
10778   DEFVAR_LISP ("standard-translation-table-for-decode",
10779                &Vstandard_translation_table_for_decode,
10780                doc: /* Table for translating characters while decoding.  */);
10781   Vstandard_translation_table_for_decode = Qnil;
10782
10783   DEFVAR_LISP ("standard-translation-table-for-encode",
10784                &Vstandard_translation_table_for_encode,
10785                doc: /* Table for translating characters while encoding.  */);
10786   Vstandard_translation_table_for_encode = Qnil;
10787
10788   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10789                doc: /* Alist of charsets vs revision numbers.
10790 While encoding, if a charset (car part of an element) is found,
10791 designate it with the escape sequence identifying revision (cdr part
10792 of the element).  */);
10793   Vcharset_revision_table = Qnil;
10794
10795   DEFVAR_LISP ("default-process-coding-system",
10796                &Vdefault_process_coding_system,
10797                doc: /* Cons of coding systems used for process I/O by default.
10798 The car part is used for decoding a process output,
10799 the cdr part is used for encoding a text to be sent to a process.  */);
10800   Vdefault_process_coding_system = Qnil;
10801
10802   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10803                doc: /*
10804 Table of extra Latin codes in the range 128..159 (inclusive).
10805 This is a vector of length 256.
10806 If Nth element is non-nil, the existence of code N in a file
10807 \(or output of subprocess) doesn't prevent it to be detected as
10808 a coding system of ISO 2022 variant which has a flag
10809 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10810 or reading output of a subprocess.
10811 Only 128th through 159th elements have a meaning.  */);
10812   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10813
10814   DEFVAR_LISP ("select-safe-coding-system-function",
10815                &Vselect_safe_coding_system_function,
10816                doc: /*
10817 Function to call to select safe coding system for encoding a text.
10818
10819 If set, this function is called to force a user to select a proper
10820 coding system which can encode the text in the case that a default
10821 coding system used in each operation can't encode the text.  The
10822 function should take care that the buffer is not modified while
10823 the coding system is being selected.
10824
10825 The default value is `select-safe-coding-system' (which see).  */);
10826   Vselect_safe_coding_system_function = Qnil;
10827
10828   DEFVAR_BOOL ("coding-system-require-warning",
10829                &coding_system_require_warning,
10830                doc: /* Internal use only.
10831 If non-nil, on writing a file, `select-safe-coding-system-function' is
10832 called even if `coding-system-for-write' is non-nil.  The command
10833 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10834   coding_system_require_warning = 0;
10835
10836
10837   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10838                &inhibit_iso_escape_detection,
10839                doc: /*
10840 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10841
10842 When Emacs reads text, it tries to detect how the text is encoded.
10843 This code detection is sensitive to escape sequences.  If Emacs sees
10844 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10845 of the ISO2022 encodings, and decodes text by the corresponding coding
10846 system (e.g. `iso-2022-7bit').
10847
10848 However, there may be a case that you want to read escape sequences in
10849 a file as is.  In such a case, you can set this variable to non-nil.
10850 Then the code detection will ignore any escape sequences, and no text is
10851 detected as encoded in some ISO-2022 encoding.  The result is that all
10852 escape sequences become visible in a buffer.
10853
10854 The default value is nil, and it is strongly recommended not to change
10855 it.  That is because many Emacs Lisp source files that contain
10856 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10857 in Emacs's distribution, and they won't be decoded correctly on
10858 reading if you suppress escape sequence detection.
10859
10860 The other way to read escape sequences in a file without decoding is
10861 to explicitly specify some coding system that doesn't use ISO-2022
10862 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10863   inhibit_iso_escape_detection = 0;
10864
10865   DEFVAR_BOOL ("inhibit-null-byte-detection",
10866                &inhibit_null_byte_detection,
10867                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10868 By default, Emacs treats it as binary data, and does not attempt to
10869 decode it.  The effect is as if you specified `no-conversion' for
10870 reading that text.
10871
10872 Set this to non-nil when a regular text happens to include null bytes.
10873 Examples are Index nodes of Info files and null-byte delimited output
10874 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10875 decode text as usual.  */);
10876   inhibit_null_byte_detection = 0;
10877
10878   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10879                doc: /* Char table for translating self-inserting characters.
10880 This is applied to the result of input methods, not their input.
10881 See also `keyboard-translate-table'.
10882
10883 Use of this variable for character code unification was rendered
10884 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10885 internal character representation.  */);
10886     Vtranslation_table_for_input = Qnil;
10887
10888   {
10889     Lisp_Object args[coding_arg_max];
10890     Lisp_Object plist[16];
10891     int i;
10892
10893     for (i = 0; i < coding_arg_max; i++)
10894       args[i] = Qnil;
10895
10896     plist[0] = intern_c_string (":name");
10897     plist[1] = args[coding_arg_name] = Qno_conversion;
10898     plist[2] = intern_c_string (":mnemonic");
10899     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10900     plist[4] = intern_c_string (":coding-type");
10901     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10902     plist[6] = intern_c_string (":ascii-compatible-p");
10903     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10904     plist[8] = intern_c_string (":default-char");
10905     plist[9] = args[coding_arg_default_char] = make_number (0);
10906     plist[10] = intern_c_string (":for-unibyte");
10907     plist[11] = args[coding_arg_for_unibyte] = Qt;
10908     plist[12] = intern_c_string (":docstring");
10909     plist[13] = make_pure_c_string ("Do no conversion.\n\
10910 \n\
10911 When you visit a file with this coding, the file is read into a\n\
10912 unibyte buffer as is, thus each byte of a file is treated as a\n\
10913 character.");
10914     plist[14] = intern_c_string (":eol-type");
10915     plist[15] = args[coding_arg_eol_type] = Qunix;
10916     args[coding_arg_plist] = Flist (16, plist);
10917     Fdefine_coding_system_internal (coding_arg_max, args);
10918
10919     plist[1] = args[coding_arg_name] = Qundecided;
10920     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10921     plist[5] = args[coding_arg_coding_type] = Qundecided;
10922     /* This is already set.
10923        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10924     plist[8] = intern_c_string (":charset-list");
10925     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10926     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10927     plist[13] = make_pure_c_string ("No conversion on encoding, automatic conversion on decoding.");
10928     plist[15] = args[coding_arg_eol_type] = Qnil;
10929     args[coding_arg_plist] = Flist (16, plist);
10930     Fdefine_coding_system_internal (coding_arg_max, args);
10931   }
10932
10933   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10934
10935   {
10936     int i;
10937
10938     for (i = 0; i < coding_category_max; i++)
10939       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10940   }
10941 #if defined (MSDOS) || defined (WINDOWSNT)
10942   system_eol_type = Qdos;
10943 #else
10944   system_eol_type = Qunix;
10945 #endif
10946   staticpro (&system_eol_type);
10947 }
10948
10949 char *
10950 emacs_strerror (error_number)
10951      int error_number;
10952 {
10953   char *str;
10954
10955   synchronize_system_messages_locale ();
10956   str = strerror (error_number);
10957
10958   if (! NILP (Vlocale_coding_system))
10959     {
10960       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10961                                                       Vlocale_coding_system,
10962                                                       0);
10963       str = (char *) SDATA (dec);
10964     }
10965
10966   return str;
10967 }
10968
10969 #endif /* emacs */
10970
10971 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10972    (do not change this comment) */