src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006, 2007, 2008, 2009 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004,
   5      2005, 2006, 2007, 2008, 2009
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H14PRO021
   8    Copyright (C) 2003
   9      National Institute of Advanced Industrial Science and Technology (AIST)
  10      Registration Number H13PRO009
  11
  12 This file is part of GNU Emacs.
  13
  14 GNU Emacs is free software: you can redistribute it and/or modify
  15 it under the terms of the GNU General Public License as published by
  16 the Free Software Foundation, either version 3 of the License, or
  17 (at your option) any later version.
  18
  19 GNU Emacs is distributed in the hope that it will be useful,
  20 but WITHOUT ANY WARRANTY; without even the implied warranty of
  21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  22 GNU General Public License for more details.
  23
  24 You should have received a copy of the GNU General Public License
  25 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  60   C level, a coding system is represented by a vector of attributes
  61   stored in the hash table Vcoding_system_hash_table.  The conversion
  62   from coding system symbol to attributes vector is done by looking up
  63   Vcoding_system_hash_table by the symbol.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we setup a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (coding, detect_info)
 158      struct coding_system *coding;
 159      struct coding_detection_info *detect_info;
 160 {
 161   const unsigned char *src = coding->source;
 162   const unsigned char *src_end = coding->source + coding->src_bytes;
 163   int multibytep = coding->src_multibyte;
 164   int consumed_chars = 0;
 165   int found = 0;
 166   ...;
 167
 168   while (1)
 169     {
 170       /* Get one byte from the source.  If the source is exhausted, jump
 171          to no_more_source:.  */
 172       ONE_MORE_BYTE (c);
 173       /* Reject on the first non-conforming byte; record positive evidence.  */
 174       if (! __C_conforms_to_XXX___ (c))
 175         break;
 176       if (__C_strongly_suggests_XXX__ (c))
 177         found = CATEGORY_MASK_XXX;
 178     }
 179   /* The byte sequence is invalid for XXX.  */
 180   detect_info->rejected |= CATEGORY_MASK_XXX;
 181   return 0;
 182
 183  no_more_source:
 184   /* The source was exhausted without meeting an invalid byte.  */
 185   detect_info->found |= found;
 186   return 1;
 187 }
 188 #endif
 189
 190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 191
 192   These functions decode a byte sequence specified as a source by
 193   CODING.  The resulting multibyte text goes to a place pointed to by
 194   CODING->charbuf, the length of which should not exceed
 195   CODING->charbuf_size.
 196
 197   These functions set the information of original and decoded texts in
 198   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 199   They also set CODING->result to one of CODING_RESULT_XXX indicating
 200   how the decoding is finished.
 201
 202   Below is the template of these functions.  */
 203
 204 #if 0
 205 static void
 206 decode_coding_XXXX (coding)
 207      struct coding_system *coding;
 208 {
 209   const unsigned char *src = coding->source + coding->consumed;
 210   const unsigned char *src_end = coding->source + coding->src_bytes;
 211   /* SRC_BASE remembers the start position in source in each loop.
 212      The loop will be exited when there's not enough source code, or
 213      when there's no room in CHARBUF for a decoded character.  */
 214   const unsigned char *src_base;
 215   /* A buffer to produce decoded characters.  */
 216   int *charbuf = coding->charbuf + coding->charbuf_used;
 217   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 218   int multibytep = coding->src_multibyte;
 219
 220   while (1)
 221     {
 222       src_base = src;
 223       if (charbuf >= charbuf_end)
 224         /* No more room to produce a decoded character.  */
 225         break;
 226       ONE_MORE_BYTE (c);
 227       /* Decode it. */
 228     }
 229
 230  no_more_source:
 231   if (src_base < src_end
 232       && coding->mode & CODING_MODE_LAST_BLOCK)
 233     /* If the source ends by partial bytes to construct a character,
 234        treat them as eight-bit raw data.  */
 235     while (src_base < src_end && charbuf < charbuf_end)
 236       *charbuf++ = *src_base++;
 237   /* Remember how many bytes and characters we consumed.  This template
 238      sets both to the byte count; a multibyte source needs separate counts.  */
 239   coding->consumed = coding->consumed_char = src_base - coding->source;
 240   /* Remember how many characters we produced.  */
 241   coding->charbuf_used = charbuf - coding->charbuf;
 242 }
 243 #endif
 244
 245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 246
 247   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 248   internal multibyte format by CODING.  The resulting byte sequence
 249   goes to a place pointed to by DESTINATION, the length of which
 250   should not exceed DST_BYTES.
 251
 252   These functions set the information of original and encoded texts in
 253   the members produced, produced_char, consumed, and consumed_char of
 254   the structure *CODING.  They also set the member result to one of
 255   CODING_RESULT_XXX indicating how the encoding finished.
 256
 257   DST_BYTES zero means that source area and destination area are
 258   overlapped, which means that we can produce an encoded text until it
 259   reaches the head of the not-yet-encoded source text.
 260
 261   Below is a template of these functions.  */
 262 #if 0
 263 static void
 264 encode_coding_XXX (coding)
 265      struct coding_system *coding;
 266 {
 267   int multibytep = coding->dst_multibyte;
 268   int *charbuf = coding->charbuf;
 269   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 270   unsigned char *dst = coding->destination + coding->produced;
 271   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 272   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 273   int produced_chars = 0;
 274   /* Stop when CHARBUF is used up or DST may lack room for one more character.  */
 275   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 276     {
 277       int c = *charbuf;
 278       /* Encode C into DST, and increment DST.  */
 279     }
 280  label_no_more_destination:
 281   /* How many chars and bytes we produced.  */
 282   coding->produced_char += produced_chars;
 283   coding->produced = dst - coding->destination;
 284 }
 285 #endif
 286
 287 \f
 288 /*** 1. Preamble ***/
 289
 290 #include <config.h>
 291 #include <stdio.h>
 292
 293 #include "lisp.h"
 294 #include "buffer.h"
 295 #include "character.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301 #include "frame.h"
 302 #include "termhooks.h"
 303
 304 Lisp_Object Vcoding_system_hash_table; /* NOTE(review): presumably maps
 305    a coding-system symbol to its attribute vector -- confirm.  */
 306 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 307 Lisp_Object Qunix, Qdos;
 308 extern Lisp_Object Qmac;        /* frame.c */
 309 Lisp_Object Qbuffer_file_coding_system;
 310 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 311 Lisp_Object Qdefault_char;
 312 Lisp_Object Qno_conversion, Qundecided;
 313 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 314 Lisp_Object Qbig, Qlittle;
 315 Lisp_Object Qcoding_system_history;
 316 Lisp_Object Qvalid_codes;
 317 Lisp_Object QCcategory, QCmnemonic, QCdefault_char;
 318 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 319 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 320 Lisp_Object QCascii_compatible_p;
 321
 322 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 323 Lisp_Object Qcall_process, Qcall_process_region;
 324 Lisp_Object Qstart_process, Qopen_network_stream;
 325 Lisp_Object Qtarget_idx;
 326 /* NOTE(review): presumably name conversion outcomes (cf. CODING_RESULT_XXX).  */
 327 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 328 Lisp_Object Qinterrupted, Qinsufficient_memory;
 329
 330 extern Lisp_Object Qcompletion_ignore_case;
 331
 332 /* If a symbol has this property, evaluate the value to define the
 333    symbol as a coding system.  */
 334 static Lisp_Object Qcoding_system_define_form;
 335
 336 int coding_system_require_warning;
 337
 338 Lisp_Object Vselect_safe_coding_system_function;
 339
 340 /* Mnemonic string for each format of end-of-line.  */
 341 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 342 /* Mnemonic string to indicate that the format of end-of-line is not
 343    yet decided.  */
 344 Lisp_Object eol_mnemonic_undecided;
 345
 346 /* Format of end-of-line decided by system.  This is Qunix on
 347    Unix and Mac, Qdos on DOS/Windows.
 348    This has an effect only for external encoding (i.e. for output to
 349    file and process), not for in-buffer or Lisp string encoding.  */
 350 static Lisp_Object system_eol_type;
 351
 352 #ifdef emacs
 353 /* The definitions down to the matching #endif exist only if `emacs' is defined.  */
 354 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 355
 356 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 357
 358 /* Coding system emacs-mule and raw-text are for converting only
 359    end-of-line format.  */
 360 Lisp_Object Qemacs_mule, Qraw_text;
 361 Lisp_Object Qutf_8_emacs;
 362
 363 /* Coding-systems are handed between Emacs Lisp programs and C internal
 364    routines by the following three variables.  */
 365 /* Coding-system for reading files and receiving data from process.  */
 366 Lisp_Object Vcoding_system_for_read;
 367 /* Coding-system for writing files and sending data to process.  */
 368 Lisp_Object Vcoding_system_for_write;
 369 /* Coding-system actually used in the latest I/O.  */
 370 Lisp_Object Vlast_coding_system_used;
 371 /* Set to non-nil when an error is detected during code conversion.  */
 372 Lisp_Object Vlast_code_conversion_error;
 373 /* A vector of length 256 which contains information about special
 374    Latin codes (especially for dealing with Microsoft codes).  */
 375 Lisp_Object Vlatin_extra_code_table;
 376
 377 /* Flag to inhibit code conversion of end-of-line format.  */
 378 int inhibit_eol_conversion;
 379
 380 /* Flag to inhibit ISO2022 escape sequence detection.  */
 381 int inhibit_iso_escape_detection;
 382
 383 /* Flag to inhibit detection of binary files through null bytes.  */
 384 int inhibit_null_byte_detection;
 385
 386 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 387 int inherit_process_coding_system;
 388
 389 /* Coding system to be used to encode text for terminal display when
 390    terminal coding system is nil.  */
 391 struct coding_system safe_terminal_coding;
 392 /* NOTE(review): presumably per-target (file/process/network) alists -- confirm.  */
 393 Lisp_Object Vfile_coding_system_alist;
 394 Lisp_Object Vprocess_coding_system_alist;
 395 Lisp_Object Vnetwork_coding_system_alist;
 396
 397 Lisp_Object Vlocale_coding_system;
 398
 399 #endif /* emacs */
 400
 401 /* Flag to tell if we look up a translation table on character code
 402    conversion.  */
 403 Lisp_Object Venable_character_translation;
 404 /* Standard translation table to look up on decoding (reading).  */
 405 Lisp_Object Vstandard_translation_table_for_decode;
 406 /* Standard translation table to look up on encoding (writing).  */
 407 Lisp_Object Vstandard_translation_table_for_encode;
 408 /* NOTE(review): presumably symbols related to translation tables -- confirm.  */
 409 Lisp_Object Qtranslation_table;
 410 Lisp_Object Qtranslation_table_id;
 411 Lisp_Object Qtranslation_table_for_decode;
 412 Lisp_Object Qtranslation_table_for_encode;
 413
 414 /* Alist of charsets vs revision number.  */
 415 static Lisp_Object Vcharset_revision_table;
 416
 417 /* Default coding systems used for process I/O.  */
 418 Lisp_Object Vdefault_process_coding_system;
 419
 420 /* Char table for translating Quail and self-inserting input.  */
 421 Lisp_Object Vtranslation_table_for_input;
 422
 423 /* Two special coding systems.  */
 424 Lisp_Object Vsjis_coding_system;
 425 Lisp_Object Vbig5_coding_system;
 426
 427 /* ISO2022 section */
 428 /* Element REG of CODING's `coding_attr_iso_initial' attribute, via XINT.  */
 429 #define CODING_ISO_INITIAL(coding, reg)                 \
 430   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 431                      coding_attr_iso_initial),          \
 432                reg)))
 433 /* CODING->safe_charsets[CHARSET_ID], or -1 if CHARSET_ID is above
 434    max_charset_id or the entry is 255.  CHARSET_ID is expanded up to 3 times.  */
 435 #define CODING_ISO_REQUEST(coding, charset_id)          \
 436   (((charset_id) <= (coding)->max_charset_id            \
 437     ? ((coding)->safe_charsets[charset_id] != 255       \
 438        ? (coding)->safe_charsets[charset_id]            \
 439        : -1)                                            \
 440     : -1))
 441
 442 /* Accessors for the members of CODING->spec.iso_2022.  */
 443 #define CODING_ISO_FLAGS(coding)        \
 444   ((coding)->spec.iso_2022.flags)
 445 #define CODING_ISO_DESIGNATION(coding, reg)     \
 446   ((coding)->spec.iso_2022.current_designation[reg])
 447 #define CODING_ISO_INVOCATION(coding, plane)    \
 448   ((coding)->spec.iso_2022.current_invocation[plane])
 449 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 450   ((coding)->spec.iso_2022.single_shifting)
 451 #define CODING_ISO_BOL(coding)  \
 452   ((coding)->spec.iso_2022.bol)
 453 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 454   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 455 #define CODING_ISO_CMP_STATUS(coding)   \
 456   (&(coding)->spec.iso_2022.cmp_status)
 457 #define CODING_ISO_EXTSEGMENT_LEN(coding)       \
 458   ((coding)->spec.iso_2022.ctext_extended_segment_len)
 459 #define CODING_ISO_EMBEDDED_UTF_8(coding)       \
 460   ((coding)->spec.iso_2022.embedded_utf_8)
 461
 462 /* Control characters of ISO2022.  */
 463                         /* code */      /* function */
 464 #define ISO_CODE_LF     0x0A            /* line-feed */
 465 #define ISO_CODE_CR     0x0D            /* carriage-return */
 466 #define ISO_CODE_SO     0x0E            /* shift-out */
 467 #define ISO_CODE_SI     0x0F            /* shift-in */
 468 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 469 #define ISO_CODE_ESC    0x1B            /* escape */
 470 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 471 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 472 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 473
 474 /* Every 1-byte code of ISO2022 is classified into one of the
 475    following classes.  */
 476 enum iso_code_class_type
 477   {
 478     ISO_control_0,              /* Control codes in the range
 479                                    0x00..0x1F and 0x7F, except for the
 480                                    following 4 codes.  */
 481     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 482     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 483     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 484     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 485     ISO_control_1,              /* Control codes in the range
 486                                    0x80..0x9F, except for the
 487                                    following 3 codes.  */
 488     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 489     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 490     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 491     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 492     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 493     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 494     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 495   };
 496
 497 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 498     `iso-flags' attribute of an iso2022 coding system.  */
 499
 500 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 501    instead of the correct short-form sequence (e.g. ESC $ A).  */
 502 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 503
 504 /* If set, reset graphic planes and registers at end-of-line to the
 505    initial state.  */
 506 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 507
 508 /* If set, reset graphic planes and registers before any control
 509    characters to the initial state.  */
 510 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 511
 512 /* If set, encode by 7-bit environment.  */
 513 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 514
 515 /* If set, use locking-shift function.  */
 516 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 517
 518 /* If set, use single-shift function.  Overwrite
 519    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 520 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 521
 522 /* If set, use designation escape sequence.  */
 523 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 524
 525 /* If set, produce revision number sequence.  */
 526 #define CODING_ISO_FLAG_REVISION        0x0080
 527
 528 /* If set, produce ISO6429's direction specifying sequence.  */
 529 #define CODING_ISO_FLAG_DIRECTION       0x0100
 530
 531 /* If set, assume designation states are reset at beginning of line on
 532    output.  */
 533 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 534
 535 /* If set, designation sequence should be placed at beginning of line
 536    on output.  */
 537 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 538
 539 /* If set, do not encode unsafe characters on output.  */
 540 #define CODING_ISO_FLAG_SAFE            0x0800
 541
 542 /* If set, extra latin codes (128..159) are accepted as a valid code
 543    on input.  */
 544 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 545 /* NOTE(review): the following five flags are not described in this file.  */
 546 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 547
 548 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 549
 550 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 551
 552 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 553 /* Bits 0x20000..0x80000 are not assigned in this list.  */
 554 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 555
 556 /* A character to be produced on output if encoding of the original
 557    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 558 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 559
 560 /* UTF-8 section: accessor for CODING->spec.utf_8_bom.  */
 561 #define CODING_UTF_8_BOM(coding)        \
 562   ((coding)->spec.utf_8_bom)
 563
 564 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
 565 #define CODING_UTF_16_BOM(coding)       \
 566   ((coding)->spec.utf_16.bom)
 567
 568 #define CODING_UTF_16_ENDIAN(coding)    \
 569   ((coding)->spec.utf_16.endian)
 570
 571 #define CODING_UTF_16_SURROGATE(coding) \
 572   ((coding)->spec.utf_16.surrogate)
 573
 574
 575 /* CCL section: fetch CCL-related slots from the attributes of CODING->id.  */
 576 #define CODING_CCL_DECODER(coding)      \
 577   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 578 #define CODING_CCL_ENCODER(coding)      \
 579   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 580 #define CODING_CCL_VALIDS(coding)                                          \
 581   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 582
 583 /* Index for each coding category in `coding_categories'.
 584    coding_category_max counts the categories and is used as an array size.  */
 585 enum coding_category
 586   {
 587     coding_category_iso_7,
 588     coding_category_iso_7_tight,
 589     coding_category_iso_8_1,
 590     coding_category_iso_8_2,
 591     coding_category_iso_7_else,
 592     coding_category_iso_8_else,
 593     coding_category_utf_8_auto,
 594     coding_category_utf_8_nosig,
 595     coding_category_utf_8_sig,
 596     coding_category_utf_16_auto,
 597     coding_category_utf_16_be,
 598     coding_category_utf_16_le,
 599     coding_category_utf_16_be_nosig,
 600     coding_category_utf_16_le_nosig,
 601     coding_category_charset,
 602     coding_category_sjis,
 603     coding_category_big5,
 604     coding_category_ccl,
 605     coding_category_emacs_mule,
 606     /* All above are targets of code detection.  */
 607     coding_category_raw_text,
 608     coding_category_undecided,
 609     coding_category_max
 610   };
 611
 612 /* Flag bits used in detect_coding_XXXX: bit N stands for category N.  */
 613 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 614 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 615 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 616 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 617 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 618 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 619 #define CATEGORY_MASK_UTF_8_AUTO        (1 << coding_category_utf_8_auto)
 620 #define CATEGORY_MASK_UTF_8_NOSIG       (1 << coding_category_utf_8_nosig)
 621 #define CATEGORY_MASK_UTF_8_SIG         (1 << coding_category_utf_8_sig)
 622 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 623 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 624 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 625 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 626 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 627 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 628 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 629 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 630 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 631 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 632 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 633
 634 /* This value is returned if detect_coding_mask () finds nothing other
 635    than ASCII characters.  It ORs all masks except CATEGORY_MASK_RAW_TEXT.  */
 636 #define CATEGORY_MASK_ANY               \
 637   (CATEGORY_MASK_ISO_7                  \
 638    | CATEGORY_MASK_ISO_7_TIGHT          \
 639    | CATEGORY_MASK_ISO_8_1              \
 640    | CATEGORY_MASK_ISO_8_2              \
 641    | CATEGORY_MASK_ISO_7_ELSE           \
 642    | CATEGORY_MASK_ISO_8_ELSE           \
 643    | CATEGORY_MASK_UTF_8_AUTO           \
 644    | CATEGORY_MASK_UTF_8_NOSIG          \
 645    | CATEGORY_MASK_UTF_8_SIG            \
 646    | CATEGORY_MASK_UTF_16_AUTO          \
 647    | CATEGORY_MASK_UTF_16_BE            \
 648    | CATEGORY_MASK_UTF_16_LE            \
 649    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 650    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 651    | CATEGORY_MASK_CHARSET              \
 652    | CATEGORY_MASK_SJIS                 \
 653    | CATEGORY_MASK_BIG5                 \
 654    | CATEGORY_MASK_CCL                  \
 655    | CATEGORY_MASK_EMACS_MULE)
 656
 657 /* Unions of related category masks.  */
 658 #define CATEGORY_MASK_ISO_7BIT \
 659   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 660
 661 #define CATEGORY_MASK_ISO_8BIT \
 662   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 663
 664 #define CATEGORY_MASK_ISO_ELSE \
 665   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 666
 667 #define CATEGORY_MASK_ISO_ESCAPE        \
 668   (CATEGORY_MASK_ISO_7                  \
 669    | CATEGORY_MASK_ISO_7_TIGHT          \
 670    | CATEGORY_MASK_ISO_7_ELSE           \
 671    | CATEGORY_MASK_ISO_8_ELSE)
 672
 673 #define CATEGORY_MASK_ISO       \
 674   (  CATEGORY_MASK_ISO_7BIT     \
 675      | CATEGORY_MASK_ISO_8BIT   \
 676      | CATEGORY_MASK_ISO_ELSE)
 677
 678 #define CATEGORY_MASK_UTF_16            \
 679   (CATEGORY_MASK_UTF_16_AUTO            \
 680    | CATEGORY_MASK_UTF_16_BE            \
 681    | CATEGORY_MASK_UTF_16_LE            \
 682    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 683    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 684
 685 #define CATEGORY_MASK_UTF_8     \
 686   (CATEGORY_MASK_UTF_8_AUTO     \
 687    | CATEGORY_MASK_UTF_8_NOSIG  \
 688    | CATEGORY_MASK_UTF_8_SIG)
 689
 690 /* List of symbols `coding-category-xxx' ordered by priority.  This
 691    variable is exposed to Emacs Lisp.  */
 692 static Lisp_Object Vcoding_category_list;
 693
 694 /* Table of coding categories (Lisp symbols).  This variable is for
 695    internal use only.  */
 696 static Lisp_Object Vcoding_category_table;
 697
 698 /* Table of coding-categories ordered by priority.  */
 699 static enum coding_category coding_priorities[coding_category_max];
 700
 701 /* Nth element is a coding context for the coding system bound to the
 702    Nth coding category.  */
 703 static struct coding_system coding_categories[coding_category_max];
 704
 705 /*** Commonly used macros and functions ***/
 706 /* Fallback min/max.  Each argument is expanded twice: avoid side effects.  */
 707 #ifndef min
 708 #define min(a, b) ((a) < (b) ? (a) : (b))
 709 #endif
 710 #ifndef max
 711 #define max(a, b) ((a) > (b) ? (a) : (b))
 712 #endif
 713 /* Set ATTRS to the attributes of CODING->id and CHARSET_LIST to their charset list.  */
 714 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 715   do {                                                  \
 716     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 717     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 718   } while (0)
 719
 720
 721 /* Safely get one byte from the source text pointed by SRC which ends
 722    at SRC_END, set C to that byte, and increment consumed_chars.  If
 723    the source is used up, jump to `no_more_source', first recording
 724    CODING_RESULT_INSUFFICIENT_SRC when SRC_BASE < SRC.  If multibytep is
 725    nonzero and the byte has bit 0x80 set: a 0xC0/0xC1 lead byte is merged
 726    with the next byte (read without a SRC_END check); otherwise C is set to
 727    the negated character code and CODING_RESULT_INVALID_SRC is recorded.
 728    Caller provides: src, src_end, src_base, multibytep, coding, consumed_chars */
 729 #define ONE_MORE_BYTE(c)                                \
 730   do {                                                  \
 731     if (src == src_end)                                 \
 732       {                                                 \
 733         if (src_base < src)                             \
 734           record_conversion_result                      \
 735             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 736         goto no_more_source;                            \
 737       }                                                 \
 738     c = *src++;                                         \
 739     if (multibytep && (c & 0x80))                       \
 740       {                                                 \
 741         if ((c & 0xFE) == 0xC0)                         \
 742           c = ((c & 1) << 6) | *src++;                  \
 743         else                                            \
 744           {                                             \
 745             src--;                                      \
 746             c = - string_char (src, &src, NULL);        \
 747             record_conversion_result                    \
 748               (coding, CODING_RESULT_INVALID_SRC);      \
 749           }                                             \
 750       }                                                 \
 751     consumed_chars++;                                   \
 752   } while (0)
 753
 754 /* Safely get two bytes from the source text pointed by SRC which ends
 755    at SRC_END, and set C1 and C2 to those bytes while skipping the
 756    heading multibyte characters.  If there are not enough bytes in the
 757    source, it jumps to `no_more_source'.  If multibytep is nonzero and
 758    a multibyte character is found for C2, set C2 to -1.  C1 and C2 are
 759    assigned -1 and compared with 0, so they must be signed.  The caller
 760    should declare and set these variables appropriately in advance:
 761         src, src_end, multibytep
 762    It is intended that this macro is used in detect_coding_utf_16.  */
 763
 764 #define TWO_MORE_BYTES(c1, c2)                          \
 765   do {                                                  \
 766     do {                                                \
 767       if (src == src_end)                               \
 768         goto no_more_source;                            \
 769       c1 = *src++;                                      \
 770       if (multibytep && (c1 & 0x80))                    \
 771         {                                               \
 772           if ((c1 & 0xFE) == 0xC0)                      \
 773             c1 = ((c1 & 1) << 6) | *src++;              \
 774           else                                          \
 775             {                                           \
 776               src += BYTES_BY_CHAR_HEAD (c1) - 1;       \
 777               c1 = -1;                                  \
 778             }                                           \
 779         }                                               \
 780     } while (c1 < 0);                                   \
 781     if (src == src_end)                                 \
 782       goto no_more_source;                              \
 783     c2 = *src++;                                        \
 784     if (multibytep && (c2 & 0x80))                      \
 785       {                                                 \
 786         if ((c2 & 0xFE) == 0xC0)                        \
 787           c2 = ((c2 & 1) << 6) | *src++;                \
 788         else                                            \
 789           c2 = -1;                                      \
 790       }                                                 \
 791   } while (0)
 792
 793
 794 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 795   do {                                                  \
 796     c = *src++;                                         \
 797     if (multibytep && (c & 0x80))                       \
 798       {                                                 \
 799         if ((c & 0xFE) == 0xC0)                         \
 800           c = ((c & 1) << 6) | *src++;                  \
 801         else                                            \
 802           {                                             \
 803             src--;                                      \
 804             c = - string_char (src, &src, NULL);        \
 805             record_conversion_result                    \
 806               (coding, CODING_RESULT_INVALID_SRC);      \
 807           }                                             \
 808       }                                                 \
 809     consumed_chars++;                                   \
 810   } while (0)
 811
 812
 813 /* Store a byte C in the place pointed by DST and increment DST to the
 814    next free point, and increment PRODUCED_CHARS.  The caller should
 815    assure that C is 0..127, and declare and set the variable `dst'
 816    appropriately in advance.
 817 */
 818
 819
 820 #define EMIT_ONE_ASCII_BYTE(c)  \
 821   do {                          \
 822     produced_chars++;           \
 823     *dst++ = (c);               \
 824   } while (0)
 825
 826
 827 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 828
 829 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 830   do {                                  \
 831     produced_chars += 2;                \
 832     *dst++ = (c1), *dst++ = (c2);       \
 833   } while (0)
 834
 835
 836 /* Store a byte C in the place pointed by DST and increment DST to the
 837    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 838    nonzero, store in an appropriate multibyte from.  The caller should
 839    declare and set the variables `dst' and `multibytep' appropriately
 840    in advance.  */
 841
 842 #define EMIT_ONE_BYTE(c)                \
 843   do {                                  \
 844     produced_chars++;                   \
 845     if (multibytep)                     \
 846       {                                 \
 847         int ch = (c);                   \
 848         if (ch >= 0x80)                 \
 849           ch = BYTE8_TO_CHAR (ch);      \
 850         CHAR_STRING_ADVANCE (ch, dst);  \
 851       }                                 \
 852     else                                \
 853       *dst++ = (c);                     \
 854   } while (0)
 855
 856
 857 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 858
 859 #define EMIT_TWO_BYTES(c1, c2)          \
 860   do {                                  \
 861     produced_chars += 2;                \
 862     if (multibytep)                     \
 863       {                                 \
 864         int ch;                         \
 865                                         \
 866         ch = (c1);                      \
 867         if (ch >= 0x80)                 \
 868           ch = BYTE8_TO_CHAR (ch);      \
 869         CHAR_STRING_ADVANCE (ch, dst);  \
 870         ch = (c2);                      \
 871         if (ch >= 0x80)                 \
 872           ch = BYTE8_TO_CHAR (ch);      \
 873         CHAR_STRING_ADVANCE (ch, dst);  \
 874       }                                 \
 875     else                                \
 876       {                                 \
 877         *dst++ = (c1);                  \
 878         *dst++ = (c2);                  \
 879       }                                 \
 880   } while (0)
 881
 882
/* Emit three bytes C1, C2 and C3 in that order: C1 with EMIT_ONE_BYTE,
   then C2 and C3 with EMIT_TWO_BYTES.  */

#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_ONE_BYTE (c1);                 \
    EMIT_TWO_BYTES (c2, c3);            \
  } while (0)
 888
 889
/* Emit four bytes C1, C2, C3 and C4 in that order, as two successive
   EMIT_TWO_BYTES pairs.  */

#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_TWO_BYTES (c1, c2);                    \
    EMIT_TWO_BYTES (c3, c4);                    \
  } while (0)
 895
 896
/* Prototypes for static functions.  */

/* Recording of the result code of a conversion.  */
static void record_conversion_result P_ ((struct coding_system *coding,
					  enum coding_result_code result));

/* UTF-8: detector, decoder and encoder.  */
static int detect_coding_utf_8 P_ ((struct coding_system *,
				    struct coding_detection_info *info));
static void decode_coding_utf_8 P_ ((struct coding_system *));
static int encode_coding_utf_8 P_ ((struct coding_system *));

/* UTF-16.  */
static int detect_coding_utf_16 P_ ((struct coding_system *,
				     struct coding_detection_info *info));
static void decode_coding_utf_16 P_ ((struct coding_system *));
static int encode_coding_utf_16 P_ ((struct coding_system *));

/* ISO 2022.  */
static int detect_coding_iso_2022 P_ ((struct coding_system *,
				       struct coding_detection_info *info));
static void decode_coding_iso_2022 P_ ((struct coding_system *));
static int encode_coding_iso_2022 P_ ((struct coding_system *));

/* emacs-mule.  */
static int detect_coding_emacs_mule P_ ((struct coding_system *,
					 struct coding_detection_info *info));
static void decode_coding_emacs_mule P_ ((struct coding_system *));
static int encode_coding_emacs_mule P_ ((struct coding_system *));

/* Shift-JIS.  */
static int detect_coding_sjis P_ ((struct coding_system *,
				   struct coding_detection_info *info));
static void decode_coding_sjis P_ ((struct coding_system *));
static int encode_coding_sjis P_ ((struct coding_system *));

/* BIG5.  */
static int detect_coding_big5 P_ ((struct coding_system *,
				   struct coding_detection_info *info));
static void decode_coding_big5 P_ ((struct coding_system *));
static int encode_coding_big5 P_ ((struct coding_system *));

/* CCL.  */
static int detect_coding_ccl P_ ((struct coding_system *,
				  struct coding_detection_info *info));
static void decode_coding_ccl P_ ((struct coding_system *));
static int encode_coding_ccl P_ ((struct coding_system *));

/* Raw text (no detector).  */
static void decode_coding_raw_text P_ ((struct coding_system *));
static int encode_coding_raw_text P_ ((struct coding_system *));

/* Source/destination pointer maintenance and destination growth.  */
static void coding_set_source P_ ((struct coding_system *));
static void coding_set_destination P_ ((struct coding_system *));
static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
static void coding_alloc_by_making_gap P_ ((struct coding_system *,
					    EMACS_INT, EMACS_INT));
static unsigned char *alloc_destination P_ ((struct coding_system *,
					     EMACS_INT, unsigned char *));

/* Remaining helpers defined later in this file.  */
static void setup_iso_safe_charsets P_ ((Lisp_Object));
static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
						     int *, int *,
						     unsigned char *));
static int detect_eol P_ ((const unsigned char *,
			   EMACS_INT, enum coding_category));
static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
static void decode_eol P_ ((struct coding_system *));
static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *));
static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
static INLINE void produce_charset P_ ((struct coding_system *, int *,
					EMACS_INT));
static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
static int decode_coding P_ ((struct coding_system *));
static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
						      struct coding_system *,
						      int *, EMACS_INT *));
static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
						  struct coding_system *,
						  int *, EMACS_INT *));
static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
static int encode_coding P_ ((struct coding_system *));
static Lisp_Object make_conversion_work_buffer P_ ((int));
static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
static INLINE int char_encodable_p P_ ((int, Lisp_Object));
static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 972
 973 static void
 974 record_conversion_result (struct coding_system *coding,
 975                           enum coding_result_code result)
 976 {
 977   coding->result = result;
 978   switch (result)
 979     {
 980     case CODING_RESULT_INSUFFICIENT_SRC:
 981       Vlast_code_conversion_error = Qinsufficient_source;
 982       break;
 983     case CODING_RESULT_INCONSISTENT_EOL:
 984       Vlast_code_conversion_error = Qinconsistent_eol;
 985       break;
 986     case CODING_RESULT_INVALID_SRC:
 987       Vlast_code_conversion_error = Qinvalid_source;
 988       break;
 989     case CODING_RESULT_INTERRUPT:
 990       Vlast_code_conversion_error = Qinterrupted;
 991       break;
 992     case CODING_RESULT_INSUFFICIENT_MEM:
 993       Vlast_code_conversion_error = Qinsufficient_memory;
 994       break;
 995     case CODING_RESULT_SUCCESS:
 996       break;
 997     default:
 998       Vlast_code_conversion_error = intern ("Unknown error");
 999     }
1000 }
1001
/* Decode CODE of CHARSET with DECODE_CHAR and store the result in C.
   If `charset_map_loaded' is nonzero afterwards, refresh
   CODING->source with coding_set_source and shift SRC, SRC_BASE and
   SRC_END by the distance the source moved.

   Every macro parameter is parenthesized at its point of use so that
   arbitrary expressions (e.g. `&cs' for CODING) expand correctly.  */

#define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
  do {									     \
    charset_map_loaded = 0;						     \
    (c) = DECODE_CHAR (charset, code);					     \
    if (charset_map_loaded)						     \
      {									     \
	const unsigned char *orig = (coding)->source;			     \
	EMACS_INT offset;						     \
									     \
	coding_set_source (coding);					     \
	/* Distance by which the source text was relocated.  */	     \
	offset = (coding)->source - orig;				     \
	(src) += offset;						     \
	(src_base) += offset;						     \
	(src_end) += offset;						     \
      }									     \
  } while (0)
1018
1019
/* Make sure the destination has room for BYTES more bytes.  If DST +
   BYTES reaches or passes DST_END, enlarge coding->destination by
   calling alloc_destination, and update DST and DST_END to match.  We
   don't have to take care of coding->source which will be relocated.
   It is handled by calling coding_set_source in encode_coding.  The
   caller must have `coding', `dst', `dst_end', `charbuf' and
   `charbuf_end' in scope.  */

#define ASSURE_DESTINATION(bytes)				\
  do {								\
    if (dst + (bytes) >= dst_end)				\
      {								\
	/* Ask for BYTES plus one byte per charbuf element	\
	   still to be processed.  */				\
	int more_bytes = charbuf_end - charbuf + (bytes);	\
								\
	dst = alloc_destination (coding, more_bytes, dst);	\
	dst_end = coding->destination + coding->dst_bytes;	\
      }								\
  } while (0)
1035
1036
/* Store multibyte form of the character C in P, and advance P to the
   end of the multibyte form.  This is like CHAR_STRING_ADVANCE but it
   never calls MAYBE_UNIFY_CHAR.

   The form has one to five bytes depending on how C compares with
   MAX_1_BYTE_CHAR ... MAX_5_BYTE_CHAR; a larger C is written with
   BYTE8_STRING after subtracting 0x3FFF80.  C is evaluated more than
   once, so it must not have side effects; it is parenthesized at
   every use so that an expression argument is encoded correctly.  */

#define CHAR_STRING_ADVANCE_NO_UNIFY(c, p)		\
  do {							\
    if ((c) <= MAX_1_BYTE_CHAR)				\
      *(p)++ = (c);					\
    else if ((c) <= MAX_2_BYTE_CHAR)			\
      *(p)++ = (0xC0 | ((c) >> 6)),			\
	*(p)++ = (0x80 | ((c) & 0x3F));			\
    else if ((c) <= MAX_3_BYTE_CHAR)			\
      *(p)++ = (0xE0 | ((c) >> 12)),			\
	*(p)++ = (0x80 | (((c) >> 6) & 0x3F)),		\
	*(p)++ = (0x80 | ((c) & 0x3F));			\
    else if ((c) <= MAX_4_BYTE_CHAR)			\
      *(p)++ = (0xF0 | ((c) >> 18)),			\
	*(p)++ = (0x80 | (((c) >> 12) & 0x3F)),		\
	*(p)++ = (0x80 | (((c) >> 6) & 0x3F)),		\
	*(p)++ = (0x80 | ((c) & 0x3F));			\
    else if ((c) <= MAX_5_BYTE_CHAR)			\
      *(p)++ = 0xF8,					\
	*(p)++ = (0x80 | (((c) >> 18) & 0x0F)),		\
	*(p)++ = (0x80 | (((c) >> 12) & 0x3F)),		\
	*(p)++ = (0x80 | (((c) >> 6) & 0x3F)),		\
	*(p)++ = (0x80 | ((c) & 0x3F));			\
    else						\
      (p) += BYTE8_STRING ((c) - 0x3FFF80, p);		\
  } while (0)
1066
1067
/* Return the character code of the character whose multibyte form
   starts at P, and advance P past that form.  This is like
   STRING_CHAR_ADVANCE, but it never calls MAYBE_UNIFY_CHAR.

   The length (one to five bytes) is chosen from the high bits of the
   first byte.  A two-byte form whose lead byte is below 0xC2 yields a
   code with 0x3FFF80 ORed in.  P is evaluated several times and must
   be a side-effect-free lvalue.  */

#define STRING_CHAR_ADVANCE_NO_UNIFY(p)					\
  (((p)[0] & 0x80) == 0							\
   ? *(p)++								\
   : ((p)[0] & 0x20) == 0						\
   ? ((p) += 2,								\
      (((unsigned char) ((p)[-2]) < 0xC2 ? 0x3FFF80 : 0)		\
       | (((p)[-2] & 0x1F) << 6)					\
       | ((p)[-1] & 0x3F)))						\
   : ((p)[0] & 0x10) == 0						\
   ? ((p) += 3,								\
      (((p)[-1] & 0x3F)							\
       | (((p)[-2] & 0x3F) << 6)					\
       | (((p)[-3] & 0x0F) << 12)))					\
   : ((p)[0] & 0x08) == 0						\
   ? ((p) += 4,								\
      (((p)[-1] & 0x3F)							\
       | (((p)[-2] & 0x3F) << 6)					\
       | (((p)[-3] & 0x3F) << 12)					\
       | (((p)[-4] & 0xF) << 18)))					\
   : ((p) += 5,								\
      (((p)[-1] & 0x3F)							\
       | (((p)[-2] & 0x3F) << 6)					\
       | (((p)[-3] & 0x3F) << 12)					\
       | (((p)[-4] & 0x3F) << 18))))
1096
1097
1098 static void
1099 coding_set_source (coding)
1100      struct coding_system *coding;
1101 {
1102   if (BUFFERP (coding->src_object))
1103     {
1104       struct buffer *buf = XBUFFER (coding->src_object);
1105
1106       if (coding->src_pos < 0)
1107         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
1108       else
1109         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
1110     }
1111   else if (STRINGP (coding->src_object))
1112     {
1113       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
1114     }
1115   else
1116     /* Otherwise, the source is C string and is never relocated
1117        automatically.  Thus we don't have to update anything.  */
1118     ;
1119 }
1120
1121 static void
1122 coding_set_destination (coding)
1123      struct coding_system *coding;
1124 {
1125   if (BUFFERP (coding->dst_object))
1126     {
1127       if (coding->src_pos < 0)
1128         {
1129           coding->destination = BEG_ADDR + coding->dst_pos_byte - BEG_BYTE;
1130           coding->dst_bytes = (GAP_END_ADDR
1131                                - (coding->src_bytes - coding->consumed)
1132                                - coding->destination);
1133         }
1134       else
1135         {
1136           /* We are sure that coding->dst_pos_byte is before the gap
1137              of the buffer. */
1138           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1139                                  + coding->dst_pos_byte - BEG_BYTE);
1140           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1141                                - coding->destination);
1142         }
1143     }
1144   else
1145     /* Otherwise, the destination is C string and is never relocated
1146        automatically.  Thus we don't have to update anything.  */
1147     ;
1148 }
1149
1150
1151 static void
1152 coding_alloc_by_realloc (coding, bytes)
1153      struct coding_system *coding;
1154      EMACS_INT bytes;
1155 {
1156   coding->destination = (unsigned char *) xrealloc (coding->destination,
1157                                                     coding->dst_bytes + bytes);
1158   coding->dst_bytes += bytes;
1159 }
1160
1161 static void
1162 coding_alloc_by_making_gap (coding, gap_head_used, bytes)
1163      struct coding_system *coding;
1164      EMACS_INT gap_head_used, bytes;
1165 {
1166   if (EQ (coding->src_object, coding->dst_object))
1167     {
1168       /* The gap may contain the produced data at the head and not-yet
1169          consumed data at the tail.  To preserve those data, we at
1170          first make the gap size to zero, then increase the gap
1171          size.  */
1172       EMACS_INT add = GAP_SIZE;
1173
1174       GPT += gap_head_used, GPT_BYTE += gap_head_used;
1175       GAP_SIZE = 0; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1176       make_gap (bytes);
1177       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1178       GPT -= gap_head_used, GPT_BYTE -= gap_head_used;
1179     }
1180   else
1181     {
1182       Lisp_Object this_buffer;
1183
1184       this_buffer = Fcurrent_buffer ();
1185       set_buffer_internal (XBUFFER (coding->dst_object));
1186       make_gap (bytes);
1187       set_buffer_internal (XBUFFER (this_buffer));
1188     }
1189 }
1190
1191
1192 static unsigned char *
1193 alloc_destination (coding, nbytes, dst)
1194      struct coding_system *coding;
1195      EMACS_INT nbytes;
1196      unsigned char *dst;
1197 {
1198   EMACS_INT offset = dst - coding->destination;
1199
1200   if (BUFFERP (coding->dst_object))
1201     {
1202       struct buffer *buf = XBUFFER (coding->dst_object);
1203
1204       coding_alloc_by_making_gap (coding, dst - BUF_GPT_ADDR (buf), nbytes);
1205     }
1206   else
1207     coding_alloc_by_realloc (coding, nbytes);
1208   coding_set_destination (coding);
1209   dst = coding->destination + offset;
1210   return dst;
1211 }
1212
1213 /** Macros for annotations.  */
1214
1215 /* An annotation data is stored in the array coding->charbuf in this
1216    format:
1217      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1218    LENGTH is the number of elements in the annotation.
1219    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1220    NCHARS is the number of characters in the text annotated.
1221
1222    The format of the following elements depend on ANNOTATION_MASK.
1223
   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
1226      ... NBYTES METHOD [ COMPOSITION-COMPONENTS ... ]
1227
1228    NBYTES is the number of bytes specified in the header part of
1229    old-style emacs-mule encoding, or 0 for the other kind of
1230    composition.
1231
1232    METHOD is one of enum composition_method.
1233
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.
1236
1237    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1238    follows.
1239
1240    If ANNOTATION_MASK is 0, this annotation is just a space holder to
1241    recover from an invalid annotation, and should be skipped by
1242    produce_annotation.  */
1243
/* Maximum length of the header of annotation data.  */
#define MAX_ANNOTATION_LENGTH 5

/* Write the three-element annotation header -LEN, MASK, NCHARS at
   BUF, advance BUF past it, and mark `coding' as annotated.  The
   caller must have `coding' in scope.

   The expansion deliberately has no trailing semicolon: the caller
   supplies it, so the macro behaves as a single statement even in an
   unbraced `if ... else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)	\
  do {							\
    *(buf)++ = -(len);					\
    *(buf)++ = (mask);					\
    *(buf)++ = (nchars);				\
    coding->annotated = 1;				\
  } while (0)
1254
/* Write a composition annotation at BUF and advance BUF past it: the
   annotation header (length 5, CODING_ANNOTATE_COMPOSITION_MASK,
   NCHARS) followed by NBYTES and METHOD.  Arguments are parenthesized
   at each use, as in ADD_ANNOTATION_DATA.  */

#define ADD_COMPOSITION_DATA(buf, nchars, nbytes, method)		    \
  do {									    \
    ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (nbytes);						    \
    *(buf)++ = (method);						    \
  } while (0)
1261
1262
/* Write a charset annotation at BUF and advance BUF past it: the
   annotation header (length 4, CODING_ANNOTATE_CHARSET_MASK, NCHARS)
   followed by the charset ID.  Arguments are parenthesized at each
   use, as in ADD_ANNOTATION_DATA.  */

#define ADD_CHARSET_DATA(buf, nchars, id)				\
  do {									\
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars);	\
    *(buf)++ = (id);							\
  } while (0)
1268
1269 \f
1270 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1271
1272
1273
1274 \f
1275 /*** 3. UTF-8 ***/
1276
1277 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1278    Check if a text is encoded in UTF-8.  If it is, return 1, else
1279    return 0.  */
1280
/* Predicates classifying one octet C of a UTF-8 byte sequence.  */

/* Nonzero if C is below 0x80, i.e. a one-octet sequence.  Note that
   this is a plain comparison, so it is also nonzero for negative C.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* Nonzero if C has the bit pattern 10xxxxxx of a trailing octet.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* Nonzero if C has the bit pattern 110xxxxx (leads 2 octets).  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* Nonzero if C has the bit pattern 1110xxxx (leads 3 octets).  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* Nonzero if C has the bit pattern 11110xxx (leads 4 octets).  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* Nonzero if C has the bit pattern 111110xx (leads 5 octets).  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)

/* Character code of the byte order mark, and the three octets of its
   UTF-8 form.  */
#define UTF_BOM 0xFEFF
#define UTF_8_BOM_1 0xEF
#define UTF_8_BOM_2 0xBB
#define UTF_8_BOM_3 0xBF
1292
1293 static int
1294 detect_coding_utf_8 (coding, detect_info)
1295      struct coding_system *coding;
1296      struct coding_detection_info *detect_info;
1297 {
1298   const unsigned char *src = coding->source, *src_base;
1299   const unsigned char *src_end = coding->source + coding->src_bytes;
1300   int multibytep = coding->src_multibyte;
1301   int consumed_chars = 0;
1302   int bom_found = 0;
1303   int found = 0;
1304
1305   detect_info->checked |= CATEGORY_MASK_UTF_8;
1306   /* A coding system of this category is always ASCII compatible.  */
1307   src += coding->head_ascii;
1308
1309   while (1)
1310     {
1311       int c, c1, c2, c3, c4;
1312
1313       src_base = src;
1314       ONE_MORE_BYTE (c);
1315       if (c < 0 || UTF_8_1_OCTET_P (c))
1316         continue;
1317       ONE_MORE_BYTE (c1);
1318       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1319         break;
1320       if (UTF_8_2_OCTET_LEADING_P (c))
1321         {
1322           found = 1;
1323           continue;
1324         }
1325       ONE_MORE_BYTE (c2);
1326       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1327         break;
1328       if (UTF_8_3_OCTET_LEADING_P (c))
1329         {
1330           found = 1;
1331           if (src_base == coding->source
1332               && c == UTF_8_BOM_1 && c1 == UTF_8_BOM_2 && c2 == UTF_8_BOM_3)
1333             bom_found = 1;
1334           continue;
1335         }
1336       ONE_MORE_BYTE (c3);
1337       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1338         break;
1339       if (UTF_8_4_OCTET_LEADING_P (c))
1340         {
1341           found = 1;
1342           continue;
1343         }
1344       ONE_MORE_BYTE (c4);
1345       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1346         break;
1347       if (UTF_8_5_OCTET_LEADING_P (c))
1348         {
1349           found = 1;
1350           continue;
1351         }
1352       break;
1353     }
1354   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1355   return 0;
1356
1357  no_more_source:
1358   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1359     {
1360       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1361       return 0;
1362     }
1363   if (bom_found)
1364     {
1365       /* The first character 0xFFFE doesn't necessarily mean a BOM.  */
1366       detect_info->found |= CATEGORY_MASK_UTF_8_SIG | CATEGORY_MASK_UTF_8_NOSIG;
1367     }
1368   else
1369     {
1370       detect_info->rejected |= CATEGORY_MASK_UTF_8_SIG;
1371       if (found)
1372         detect_info->found |= CATEGORY_MASK_UTF_8_NOSIG;
1373     }
1374   return 1;
1375 }
1376
1377
1378 static void
1379 decode_coding_utf_8 (coding)
1380      struct coding_system *coding;
1381 {
1382   const unsigned char *src = coding->source + coding->consumed;
1383   const unsigned char *src_end = coding->source + coding->src_bytes;
1384   const unsigned char *src_base;
1385   int *charbuf = coding->charbuf + coding->charbuf_used;
1386   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1387   int consumed_chars = 0, consumed_chars_base = 0;
1388   int multibytep = coding->src_multibyte;
1389   enum utf_bom_type bom = CODING_UTF_8_BOM (coding);
1390   Lisp_Object attr, charset_list;
1391   int eol_crlf =
1392     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1393   int byte_after_cr = -1;
1394
1395   CODING_GET_INFO (coding, attr, charset_list);
1396
1397   if (bom != utf_without_bom)
1398     {
1399       int c1, c2, c3;
1400
1401       src_base = src;
1402       ONE_MORE_BYTE (c1);
1403       if (! UTF_8_3_OCTET_LEADING_P (c1))
1404         src = src_base;
1405       else
1406         {
1407           ONE_MORE_BYTE (c2);
1408           if (! UTF_8_EXTRA_OCTET_P (c2))
1409             src = src_base;
1410           else
1411             {
1412               ONE_MORE_BYTE (c3);
1413               if (! UTF_8_EXTRA_OCTET_P (c3))
1414                 src = src_base;
1415               else
1416                 {
1417                   if ((c1 != UTF_8_BOM_1)
1418                       || (c2 != UTF_8_BOM_2) || (c3 != UTF_8_BOM_3))
1419                     src = src_base;
1420                   else
1421                     CODING_UTF_8_BOM (coding) = utf_without_bom;
1422                 }
1423             }
1424         }
1425     }
1426   CODING_UTF_8_BOM (coding) = utf_without_bom;
1427
1428
1429
1430   while (1)
1431     {
1432       int c, c1, c2, c3, c4, c5;
1433
1434       src_base = src;
1435       consumed_chars_base = consumed_chars;
1436
1437       if (charbuf >= charbuf_end)
1438         {
1439           if (byte_after_cr >= 0)
1440             src_base--;
1441           break;
1442         }
1443
1444       if (byte_after_cr >= 0)
1445         c1 = byte_after_cr, byte_after_cr = -1;
1446       else
1447         ONE_MORE_BYTE (c1);
1448       if (c1 < 0)
1449         {
1450           c = - c1;
1451         }
1452       else if (UTF_8_1_OCTET_P(c1))
1453         {
1454           if (eol_crlf && c1 == '\r')
1455             ONE_MORE_BYTE (byte_after_cr);
1456           c = c1;
1457         }
1458       else
1459         {
1460           ONE_MORE_BYTE (c2);
1461           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1462             goto invalid_code;
1463           if (UTF_8_2_OCTET_LEADING_P (c1))
1464             {
1465               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1466               /* Reject overlong sequences here and below.  Encoders
1467                  producing them are incorrect, they can be misleading,
1468                  and they mess up read/write invariance.  */
1469               if (c < 128)
1470                 goto invalid_code;
1471             }
1472           else
1473             {
1474               ONE_MORE_BYTE (c3);
1475               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1476                 goto invalid_code;
1477               if (UTF_8_3_OCTET_LEADING_P (c1))
1478                 {
1479                   c = (((c1 & 0xF) << 12)
1480                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1481                   if (c < 0x800
1482                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1483                     goto invalid_code;
1484                 }
1485               else
1486                 {
1487                   ONE_MORE_BYTE (c4);
1488                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1489                     goto invalid_code;
1490                   if (UTF_8_4_OCTET_LEADING_P (c1))
1491                     {
1492                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1493                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1494                     if (c < 0x10000)
1495                       goto invalid_code;
1496                     }
1497                   else
1498                     {
1499                       ONE_MORE_BYTE (c5);
1500                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1501                         goto invalid_code;
1502                       if (UTF_8_5_OCTET_LEADING_P (c1))
1503                         {
1504                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1505                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1506                                | (c5 & 0x3F));
1507                           if ((c > MAX_CHAR) || (c < 0x200000))
1508                             goto invalid_code;
1509                         }
1510                       else
1511                         goto invalid_code;
1512                     }
1513                 }
1514             }
1515         }
1516
1517       *charbuf++ = c;
1518       continue;
1519
1520     invalid_code:
1521       src = src_base;
1522       consumed_chars = consumed_chars_base;
1523       ONE_MORE_BYTE (c);
1524       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1525       coding->errors++;
1526     }
1527
1528  no_more_source:
1529   coding->consumed_char += consumed_chars_base;
1530   coding->consumed = src_base - coding->source;
1531   coding->charbuf_used = charbuf - coding->charbuf;
1532 }
1533
1534
     /* Encode the characters in CODING->charbuf (CODING->charbuf_used
        elements) as UTF-8 and store the bytes in CODING->destination,
        starting at offset CODING->produced.

        If the coding system asks for a BOM (utf_with_bom), the three BOM
        bytes are emitted first and the BOM setting is switched to
        utf_without_bom.  A character for which CHAR_BYTE8_P is true is
        written as the single byte CHAR_TO_BYTE8 (c); any other character
        is written with CHAR_STRING_ADVANCE_NO_UNIFY.

        On completion the result is recorded as CODING_RESULT_SUCCESS,
        CODING->produced_char and CODING->produced are updated, and 0 is
        returned.

        NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
        elsewhere; they presumably use the locals dst, dst_end, multibytep
        and produced_chars, so confirm before renaming any of them.  */
1535 static int
1536 encode_coding_utf_8 (coding)
1537      struct coding_system *coding;
1538 {
1539   int multibytep = coding->dst_multibyte;
1540   int *charbuf = coding->charbuf;
1541   int *charbuf_end = charbuf + coding->charbuf_used;
1542   unsigned char *dst = coding->destination + coding->produced;
1543   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1544   int produced_chars = 0;
1545   int c;
1546
1547   if (CODING_UTF_8_BOM (coding) == utf_with_bom)
1548     {
1549       ASSURE_DESTINATION (3);
1550       EMIT_THREE_BYTES (UTF_8_BOM_1, UTF_8_BOM_2, UTF_8_BOM_3);
           /* Clear the request so the condition above is false next time.  */
1551       CODING_UTF_8_BOM (coding) = utf_without_bom;
1552     }
1553
1554   if (multibytep)
1555     {
           /* Multibyte destination: every output byte goes through
              EMIT_ONE_BYTE.  NOTE(review): produced_chars is not touched
              explicitly in this branch; presumably EMIT_ONE_BYTE counts
              it -- confirm.  */
1556       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1557
1558       while (charbuf < charbuf_end)
1559         {
1560           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1561
1562           ASSURE_DESTINATION (safe_room);
1563           c = *charbuf++;
1564           if (CHAR_BYTE8_P (c))
1565             {
1566               c = CHAR_TO_BYTE8 (c);
1567               EMIT_ONE_BYTE (c);
1568             }
1569           else
1570             {
                   /* Build the byte sequence in STR, then emit it byte by
                      byte.  */
1571               CHAR_STRING_ADVANCE_NO_UNIFY (c, pend);
1572               for (p = str; p < pend; p++)
1573                 EMIT_ONE_BYTE (*p);
1574             }
1575         }
1576     }
1577   else
1578     {
           /* Unibyte destination: write directly through DST and count one
              produced character per source character.  */
1579       int safe_room = MAX_MULTIBYTE_LENGTH;
1580
1581       while (charbuf < charbuf_end)
1582         {
1583           ASSURE_DESTINATION (safe_room);
1584           c = *charbuf++;
1585           if (CHAR_BYTE8_P (c))
1586             *dst++ = CHAR_TO_BYTE8 (c);
1587           else
1588             CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
1589           produced_chars++;
1590         }
1591     }
1592   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1593   coding->produced_char += produced_chars;
1594   coding->produced = dst - coding->destination;
1595   return 0;
1596 }
1597
1598
1599 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1600    Check if a text is encoded in one of UTF-16 based coding systems.
1601    If it is, return 1, else return 0.  */
1602
     /* Predicates on a UTF-16 code unit VAL (only bits 10..15 are examined
        by the surrogate tests):
          UTF_16_HIGH_SURROGATE_P -- for a 16-bit VAL, true for 0xD800..0xDBFF.
          UTF_16_LOW_SURROGATE_P  -- for a 16-bit VAL, true for 0xDC00..0xDFFF.
          UTF_16_INVALID_P        -- true for 0xFFFE, 0xFFFF, or a low
                                     surrogate.
        VAL is evaluated more than once by UTF_16_INVALID_P.  */
1603 #define UTF_16_HIGH_SURROGATE_P(val) \
1604   (((val) & 0xFC00) == 0xD800)
1605
1606 #define UTF_16_LOW_SURROGATE_P(val) \
1607   (((val) & 0xFC00) == 0xDC00)
1608
1609 #define UTF_16_INVALID_P(val)   \
1610   (((val) == 0xFFFE)            \
1611    || ((val) == 0xFFFF)         \
1612    || UTF_16_LOW_SURROGATE_P (val))
1613
1614
     /* Examine CODING->source for UTF-16 and update DETECT_INFO.

        CATEGORY_MASK_UTF_16 is always added to DETECT_INFO->checked.
        - If this is the last block and CODING->src_chars is odd, every
          UTF-16 category is rejected and 0 is returned.
        - If the first two bytes are FF FE (resp. FE FF), the LE (resp. BE)
          and AUTO categories are marked found, the opposite-endian and the
          two NOSIG categories are rejected, and 1 is returned.
        - If the second byte is negative, every UTF-16 category is rejected
          and 0 is returned.
        - Otherwise the signature categories (AUTO, BE, LE) are rejected
          and the remaining byte pairs are scanned, counting how many
          distinct values appear in the first and in the second byte of
          each pair; 128 or more distinct first (resp. second) bytes
          rejects BE_NOSIG (resp. LE_NOSIG).  0 is returned when the scan
          stops on its own.

        NOTE(review): TWO_MORE_BYTES is defined elsewhere; it presumably
        jumps to `no_more_source' when the input runs out (in which case 1
        is returned) and uses src, src_base, src_end, multibytep and
        consumed_chars -- confirm.  */
1615 static int
1616 detect_coding_utf_16 (coding, detect_info)
1617      struct coding_system *coding;
1618      struct coding_detection_info *detect_info;
1619 {
1620   const unsigned char *src = coding->source, *src_base = src;
1621   const unsigned char *src_end = coding->source + coding->src_bytes;
1622   int multibytep = coding->src_multibyte;
1623   int consumed_chars = 0;
1624   int c1, c2;
1625
1626   detect_info->checked |= CATEGORY_MASK_UTF_16;
1627   if (coding->mode & CODING_MODE_LAST_BLOCK
1628       && (coding->src_chars & 1))
1629     {
1630       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1631       return 0;
1632     }
1633
1634   TWO_MORE_BYTES (c1, c2);
1635   if ((c1 == 0xFF) && (c2 == 0xFE))
1636     {
           /* Little-endian byte order mark.  */
1637       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1638                              | CATEGORY_MASK_UTF_16_AUTO);
1639       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1640                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1641                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1642     }
1643   else if ((c1 == 0xFE) && (c2 == 0xFF))
1644     {
           /* Big-endian byte order mark.  */
1645       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1646                              | CATEGORY_MASK_UTF_16_AUTO);
1647       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1648                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1649                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1650     }
1651   else if (c2 < 0)
1652     {
1653       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1654       return 0;
1655     }
1656   else
1657     {
1658       /* We check the dispersion of the even-positioned (E) and
1659          odd-positioned (O) bytes.  If both are high, we assume binary data.  */
           /* e[b] / o[b] record whether byte value b has been seen as the
              first / second byte of a pair; e_num / o_num count the
              distinct values seen so far.  */
1660       unsigned char e[256], o[256];
1661       unsigned e_num = 1, o_num = 1;
1662
1663       memset (e, 0, 256);
1664       memset (o, 0, 256);
1665       e[c1] = 1;
1666       o[c2] = 1;
1667
           /* No BOM was seen, so only the NOSIG categories remain
              candidates.  */
1668       detect_info->rejected |= (CATEGORY_MASK_UTF_16_AUTO
1669                                 |CATEGORY_MASK_UTF_16_BE
1670                                 | CATEGORY_MASK_UTF_16_LE);
1671
           /* Stop as soon as every UTF-16 category has been rejected.  */
1672       while ((detect_info->rejected & CATEGORY_MASK_UTF_16)
1673              != CATEGORY_MASK_UTF_16)
1674         {
1675           TWO_MORE_BYTES (c1, c2);
1676           if (c2 < 0)
1677             break;
1678           if (! e[c1])
1679             {
1680               e[c1] = 1;
1681               e_num++;
1682               if (e_num >= 128)
1683                 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1684             }
1685           if (! o[c2])
1686             {
1687               o[c2] = 1;
1688               o_num++;
1689               if (o_num >= 128)
1690                 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1691             }
1692         }
1693       return 0;
1694     }
1695
1696  no_more_source:
1697   return 1;
1698 }
1699
     /* Decode UTF-16 bytes from CODING->source, starting at offset
        CODING->consumed, into CODING->charbuf (appending after
        CODING->charbuf_used).

        - With utf_with_bom, the first two bytes are checked against the
          BOM expected for the configured endianness; if they do not match
          they are re-read as ordinary data and CODING->errors is bumped.
          In both BOM modes handled here the BOM setting is then switched
          to utf_without_bom.
        - A high surrogate is remembered in CODING_UTF_16_SURROGATE (coding)
          until the next code unit.  A following low surrogate yields one
          character >= 0x10000.  Anything else is an error: the two bytes
          of the pending surrogate are stored as separate values, followed
          by the current unit (or the current unit becomes the new pending
          surrogate if it is itself a high surrogate).
        - When DOS-style EOL conversion is in effect and a CR is decoded,
          the following two bytes are read ahead (byte_after_cr1/2) before
          the CR is stored, and reused as the next pair.

        On exit CODING->consumed_char, CODING->consumed and
        CODING->charbuf_used are updated from the last fully processed
        position.

        NOTE(review): ONE_MORE_BYTE is defined elsewhere; it presumably
        jumps to `no_more_source' at end of input, yields a negative value
        for a multibyte character, and uses src, src_base, src_end,
        multibytep and consumed_chars -- confirm.  */
1700 static void
1701 decode_coding_utf_16 (coding)
1702      struct coding_system *coding;
1703 {
1704   const unsigned char *src = coding->source + coding->consumed;
1705   const unsigned char *src_end = coding->source + coding->src_bytes;
1706   const unsigned char *src_base;
1707   int *charbuf = coding->charbuf + coding->charbuf_used;
1708   /* We may produce at most 3 chars in one loop.  */
1709   int *charbuf_end = coding->charbuf + coding->charbuf_size - 2;
1710   int consumed_chars = 0, consumed_chars_base = 0;
1711   int multibytep = coding->src_multibyte;
1712   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1713   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1714   int surrogate = CODING_UTF_16_SURROGATE (coding);
1715   Lisp_Object attr, charset_list;
1716   int eol_crlf =
1717     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
1718   int byte_after_cr1 = -1, byte_after_cr2 = -1;
1719
1720   CODING_GET_INFO (coding, attr, charset_list);
1721
1722   if (bom == utf_with_bom)
1723     {
1724       int c, c1, c2;
1725
1726       src_base = src;
1727       ONE_MORE_BYTE (c1);
1728       ONE_MORE_BYTE (c2);
1729       c = (c1 << 8) | c2;
1730
1731       if (endian == utf_16_big_endian
1732           ? c != 0xFEFF : c != 0xFFFE)
1733         {
1734           /* The first two bytes are not BOM.  Treat them as bytes
1735              for a normal character.  */
1736           src = src_base;
1737           coding->errors++;
1738         }
1739       CODING_UTF_16_BOM (coding) = utf_without_bom;
1740     }
1741   else if (bom == utf_detect_bom)
1742     {
1743       /* We have already tried to detect BOM and failed in
1744          detect_coding.  */
1745       CODING_UTF_16_BOM (coding) = utf_without_bom;
1746     }
1747
1748   while (1)
1749     {
1750       int c, c1, c2;
1751
1752       src_base = src;
1753       consumed_chars_base = consumed_chars;
1754
1755       if (charbuf >= charbuf_end)
1756         {
               /* The two read-ahead bytes after a CR have not been decoded
                  yet; do not report them as consumed.  */
1757           if (byte_after_cr1 >= 0)
1758             src_base -= 2;
1759           break;
1760         }
1761
1762       if (byte_after_cr1 >= 0)
1763         c1 = byte_after_cr1, byte_after_cr1 = -1;
1764       else
1765         ONE_MORE_BYTE (c1);
1766       if (c1 < 0)
1767         {
1768           *charbuf++ = -c1;
1769           continue;
1770         }
1771       if (byte_after_cr2 >= 0)
1772         c2 = byte_after_cr2, byte_after_cr2 = -1;
1773       else
1774         ONE_MORE_BYTE (c2);
1775       if (c2 < 0)
1776         {
1777           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1778           *charbuf++ = -c2;
1779           continue;
1780         }
1781       c = (endian == utf_16_big_endian
1782            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1783
1784       if (surrogate)
1785         {
1786           if (! UTF_16_LOW_SURROGATE_P (c))
1787             {
                   /* Unpaired high surrogate: flush its two bytes in source
                      order and count an error.  */
1788               if (endian == utf_16_big_endian)
1789                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1790               else
1791                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1792               *charbuf++ = c1;
1793               *charbuf++ = c2;
1794               coding->errors++;
1795               if (UTF_16_HIGH_SURROGATE_P (c))
1796                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1797               else
                     {
                       /* The pending surrogate has just been flushed, so
                          forget it; otherwise its bytes would be emitted
                          again (and counted as an error again) for every
                          following code unit until a low surrogate
                          arrives.  */
                       CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1798                   *charbuf++ = c;
                     }
1799             }
1800           else
1801             {
                   /* Combine the pending high surrogate with this low
                      surrogate.  */
1802               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1803               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1804               *charbuf++ = 0x10000 + c;
1805             }
1806         }
1807       else
1808         {
1809           if (UTF_16_HIGH_SURROGATE_P (c))
1810             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1811           else
1812             {
1813               if (eol_crlf && c == '\r')
1814                 {
                       /* Read the next pair before storing the CR; if the
                          input ends here the CR stays unconsumed because
                          src_base still points at it.  */
1815                   ONE_MORE_BYTE (byte_after_cr1);
1816                   ONE_MORE_BYTE (byte_after_cr2);
1817                 }
1818               *charbuf++ = c;
1819             }
1820         }
1821     }
1822
1823  no_more_source:
1824   coding->consumed_char += consumed_chars_base;
1825   coding->consumed = src_base - coding->source;
1826   coding->charbuf_used = charbuf - coding->charbuf;
1827 }
1828
     /* Encode the characters in CODING->charbuf (CODING->charbuf_used
        elements) as UTF-16 and store the bytes in CODING->destination,
        starting at offset CODING->produced.

        Unless the BOM setting is utf_without_bom, a BOM is emitted first
        (FE FF for big endian, FF FE otherwise) and the setting is switched
        to utf_without_bom.  A character above MAX_UNICODE_CHAR is replaced
        by CODING->default_char.  A character below 0x10000 is written as
        one 16-bit unit; any other character is written as a surrogate
        pair.  Byte order within each unit follows the configured
        endianness.

        On completion the result is recorded as CODING_RESULT_SUCCESS,
        CODING->produced and CODING->produced_char are updated, and 0 is
        returned.

        NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
        elsewhere; they presumably use dst, dst_end, multibytep and
        produced_chars -- confirm before renaming any of them.  */
1829 static int
1830 encode_coding_utf_16 (coding)
1831      struct coding_system *coding;
1832 {
1833   int multibytep = coding->dst_multibyte;
1834   int *charbuf = coding->charbuf;
1835   int *charbuf_end = charbuf + coding->charbuf_used;
1836   unsigned char *dst = coding->destination + coding->produced;
1837   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1838   int safe_room = 8;
1839   enum utf_bom_type bom = CODING_UTF_16_BOM (coding);
1840   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1841   int produced_chars = 0;
1842   Lisp_Object attrs, charset_list;
1843   int c;
1844
1845   CODING_GET_INFO (coding, attrs, charset_list);
1846
1847   if (bom != utf_without_bom)
1848     {
1849       ASSURE_DESTINATION (safe_room);
1850       if (big_endian)
1851         EMIT_TWO_BYTES (0xFE, 0xFF);
1852       else
1853         EMIT_TWO_BYTES (0xFF, 0xFE);
           /* Clear the request so the BOM is not written again.  */
1854       CODING_UTF_16_BOM (coding) = utf_without_bom;
1855     }
1856
1857   while (charbuf < charbuf_end)
1858     {
1859       ASSURE_DESTINATION (safe_room);
1860       c = *charbuf++;
1861       if (c > MAX_UNICODE_CHAR)
1862         c = coding->default_char;
1863
1864       if (c < 0x10000)
1865         {
1866           if (big_endian)
1867             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1868           else
1869             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1870         }
1871       else
1872         {
               /* Split into a surrogate pair: C1 carries the upper bits
                  (offset 0xD800), C2 the low 10 bits (offset 0xDC00).  */
1873           int c1, c2;
1874
1875           c -= 0x10000;
1876           c1 = (c >> 10) + 0xD800;
1877           c2 = (c & 0x3FF) + 0xDC00;
1878           if (big_endian)
1879             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1880           else
1881             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1882         }
1883     }
1884   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1885   coding->produced = dst - coding->destination;
1886   coding->produced_char += produced_chars;
1887   return 0;
1888 }
1889
1890 \f
1891 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1892
1893 /* Emacs' internal format for representation of multiple character
1894    sets is a kind of multi-byte encoding, i.e. characters are
1895    represented by variable-length sequences of one-byte codes.
1896
1897    ASCII characters and control characters (e.g. `tab', `newline') are
1898    represented by one-byte sequences which are their ASCII codes, in
1899    the range 0x00 through 0x7F.
1900
1901    8-bit characters of the range 0x80..0x9F are represented by
1902    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1903    code + 0x20).
1904
1905    8-bit characters of the range 0xA0..0xFF are represented by
1906    one-byte sequences which are their 8-bit code.
1907
1908    The other characters are represented by a sequence of `base
1909    leading-code', optional `extended leading-code', and one or two
1910    `position-code's.  The length of the sequence is determined by the
1911    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1912    whereas extended leading-code and position-code take the range 0xA0
1913    through 0xFF.  See `charset.h' for more details about leading-code
1914    and position-code.
1915
1916    --- CODE RANGE of Emacs' internal format ---
1917    character set        range
1918    -------------        -----
1919    ascii                0x00..0x7F
1920    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1921    eight-bit-graphic    0xA0..0xFF
1922    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1923    ---------------------------------------------
1924
1925    As this is the internal character representation, the format is
1926    usually not used externally (i.e. in a file or in a data sent to a
1927    process).  But, it is possible to have a text externally in this
1928    format (i.e. by encoding by the coding system `emacs-mule').
1929
1930    In that case, a sequence of one-byte codes has a slightly different
1931    form.
1932
1933    At first, all characters in eight-bit-control are represented by
1934    one-byte sequences which are their 8-bit code.
1935
1936    Next, character composition data are represented by the byte
1937    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1938    where,
1939         METHOD is 0xF2 plus one of composition method (enum
1940         composition_method),
1941
1942         BYTES is 0xA0 plus a byte length of this composition data,
1943
1944         CHARS is 0xA0 plus a number of characters composed by this
1945         data,
1946
1947         COMPONENTs are characters of multibyte form or composition
1948         rules encoded by two-byte of ASCII codes.
1949
1950    In addition, for backward compatibility, the following formats are
1951    also recognized as composition data on decoding.
1952
1953    0x80 MSEQ ...
1954    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1955
1956    Here,
1957         MSEQ is a multibyte form but in these special format:
1958           ASCII: 0xA0 ASCII_CODE+0x80,
1959           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1960         RULE is a one byte code of the range 0xA0..0xF0 that
1961         represents a composition rule.
1962   */
1963
     /* Table indexed by a byte value.  The code in this file looks it up
        with the leading byte of an emacs-mule sequence to get the number
        of bytes in that sequence (emacs_mule_char handles the values 1
        through 4 and aborts on anything else).
        NOTE(review): the table is filled in elsewhere -- confirm.  */
1964 char emacs_mule_bytes[256];
1965
1966
1967 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1968    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1969    else return 0.
     
        CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
        Scanning starts after the CODING->head_ascii leading bytes.  The
        category is rejected (and 0 returned) when ESC, SI or SO is seen,
        when a composition started by 0x80 spans 4 bytes or fewer, or when
        a leading byte is not followed by the number of trailing bytes
        (each >= 0xA0) that emacs_mule_bytes gives for it.  When the input
        runs out, the category is also rejected if this is the last block
        and a sequence was left incomplete; otherwise whatever was found
        is recorded in DETECT_INFO->found and 1 is returned.
     
        NOTE(review): ONE_MORE_BYTE is defined elsewhere; it presumably
        jumps to `no_more_source' at end of input and may refer to the
        innermost variable named src_base, so the shadowing declaration
        below is left untouched -- confirm before renaming it.  */
1970
1971 static int
1972 detect_coding_emacs_mule (coding, detect_info)
1973      struct coding_system *coding;
1974      struct coding_detection_info *detect_info;
1975 {
1976   const unsigned char *src = coding->source, *src_base;
1977   const unsigned char *src_end = coding->source + coding->src_bytes;
1978   int multibytep = coding->src_multibyte;
1979   int consumed_chars = 0;
1980   int c;
1981   int found = 0;
1982
1983   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1984   /* A coding system of this category is always ASCII compatible.  */
1985   src += coding->head_ascii;
1986
1987   while (1)
1988     {
1989       src_base = src;
1990       ONE_MORE_BYTE (c);
1991       if (c < 0)
1992         continue;
1993       if (c == 0x80)
1994         {
1995           /* Perhaps the start of composite character.  We simply skip
1996              it because analyzing it is too heavy for detecting.  But,
1997              at least, we check that the composite character
1998              consists of more than 4 bytes.  */
1999           const unsigned char *src_base;
2000
2001         repeat:
2002           src_base = src;
2003           do
2004             {
2005               ONE_MORE_BYTE (c);
2006             }
2007           while (c >= 0xA0);
2008
2009           if (src - src_base <= 4)
2010             break;
2011           found = CATEGORY_MASK_EMACS_MULE;
2012           if (c == 0x80)
2013             goto repeat;
2014         }
2015
2016       if (c < 0x80)
2017         {
2018           if (c < 0x20
2019               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
2020             break;
2021         }
2022       else
2023         {
               /* Index by the byte just read (C is in 0x80..0xFF here).
                  The outer src_base cannot be used for this: after a
                  composition has been skipped above it still points at
                  the 0x80 starter, not at the current leading byte.  */
2024           int more_bytes = emacs_mule_bytes[c] - 1;
2025
2026           while (more_bytes > 0)
2027             {
2028               ONE_MORE_BYTE (c);
2029               if (c < 0xA0)
2030                 {
2031                   src--;        /* Unread the last byte.  */
2032                   break;
2033                 }
2034               more_bytes--;
2035             }
2036           if (more_bytes != 0)
2037             break;
2038           found = CATEGORY_MASK_EMACS_MULE;
2039         }
2040     }
2041   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2042   return 0;
2043
2044  no_more_source:
       /* src_base < src means the input ended in the middle of a
          sequence.  */
2045   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
2046     {
2047       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
2048       return 0;
2049     }
2050   detect_info->found |= found;
2051   return 1;
2052 }
2053
2054
2055 /* Parse emacs-mule multibyte sequence at SRC and return the decoded
2056    character.  If CMP_STATUS indicates that we must expect MSEQ or
2057    RULE described above, decode it and return the negative value of
2058    the decoded character or rule.  If an invalid byte is found, return
2059    -1.  If SRC is too short, return -2.
     
        On success, *NBYTES is set to the number of source bytes read and
        *NCHARS to the value of the local counter consumed_chars; if ID is
        non-null, *ID is set to the id of the charset the character was
        decoded from.  The end of the source is taken from CODING
        (CODING->source + CODING->src_bytes).
     
        NOTE(review): ONE_MORE_BYTE is defined elsewhere; it presumably
        jumps to `no_more_source' at end of input, yields a negative value
        for a multibyte character, and updates consumed_chars -- confirm.
        CMP_STATUS is dereferenced whenever the first byte is >= 0xA0.  */
2060
2061 int
2062 emacs_mule_char (coding, src, nbytes, nchars, id, cmp_status)
2063      struct coding_system *coding;
2064      const unsigned char *src;
2065      int *nbytes, *nchars, *id;
2066      struct composition_status *cmp_status;
2067 {
2068   const unsigned char *src_end = coding->source + coding->src_bytes;
2069   const unsigned char *src_base = src;
2070   int multibytep = coding->src_multibyte;
2071   struct charset *charset;
2072   unsigned code;
2073   int c;
2074   int consumed_chars = 0;
2075   int mseq_found = 0;
2076
2077   ONE_MORE_BYTE (c);
2078   if (c < 0)
2079     {
2080       c = -c;
2081       charset = emacs_mule_charset[0];
2082     }
2083   else
2084     {
2085       if (c >= 0xA0)
2086         {
               /* A first byte >= 0xA0 is accepted only inside an old-form
                  composition.  */
2087           if (cmp_status->state != COMPOSING_NO
2088               && cmp_status->old_form)
2089             {
2090               if (cmp_status->state == COMPOSING_CHAR)
2091                 {
                       /* MSEQ: 0xA0 introduces an ASCII code stored with
                          0x80 added; any other byte is a leading code
                          stored with 0x20 added.  */
2092                   if (c == 0xA0)
2093                     {
2094                       ONE_MORE_BYTE (c);
2095                       c -= 0x80;
2096                       if (c < 0)
2097                         goto invalid_code;
2098                     }
2099                   else
2100                     c -= 0x20;
2101                   mseq_found = 1;
2102                 }
2103               else
2104                 {
                       /* Not expecting a character: hand the byte back
                          negated.  */
2105                   *nbytes = src - src_base;
2106                   *nchars = consumed_chars;
2107                   return -c;
2108                 }
2109             }
2110           else
2111             goto invalid_code;
2112         }
2113
           /* Dispatch on the sequence length given for the leading byte;
              every trailing byte must be >= 0xA0 and contributes its low
              7 bits to CODE.  */
2114       switch (emacs_mule_bytes[c])
2115         {
2116         case 2:
2117           if (! (charset = emacs_mule_charset[c]))
2118             goto invalid_code;
2119           ONE_MORE_BYTE (c);
2120           if (c < 0xA0)
2121             goto invalid_code;
2122           code = c & 0x7F;
2123           break;
2124
2125         case 3:
2126           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
2127               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
2128             {
                   /* Private leading code: the next byte selects the
                      charset, followed by a one-byte code.  */
2129               ONE_MORE_BYTE (c);
2130               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
2131                 goto invalid_code;
2132               ONE_MORE_BYTE (c);
2133               if (c < 0xA0)
2134                 goto invalid_code;
2135               code = c & 0x7F;
2136             }
2137           else
2138             {
                   /* The leading byte selects the charset, followed by a
                      two-byte code.  */
2139               if (! (charset = emacs_mule_charset[c]))
2140                 goto invalid_code;
2141               ONE_MORE_BYTE (c);
2142               if (c < 0xA0)
2143                 goto invalid_code;
2144               code = (c & 0x7F) << 8;
2145               ONE_MORE_BYTE (c);
2146               if (c < 0xA0)
2147                 goto invalid_code;
2148               code |= c & 0x7F;
2149             }
2150           break;
2151
2152         case 4:
               /* The second byte selects the charset, followed by a
                  two-byte code.  */
2153           ONE_MORE_BYTE (c);
2154           if (c < 0 || ! (charset = emacs_mule_charset[c]))
2155             goto invalid_code;
2156           ONE_MORE_BYTE (c);
2157           if (c < 0xA0)
2158             goto invalid_code;
2159           code = (c & 0x7F) << 8;
2160           ONE_MORE_BYTE (c);
2161           if (c < 0xA0)
2162             goto invalid_code;
2163           code |= c & 0x7F;
2164           break;
2165
2166         case 1:
2167           code = c;
2168           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
2169                                      ? charset_ascii : charset_eight_bit);
2170           break;
2171
2172         default:
2173           abort ();
2174         }
2175       c = DECODE_CHAR (charset, code);
2176       if (c < 0)
2177         goto invalid_code;
2178     }
2179   *nbytes = src - src_base;
2180   *nchars = consumed_chars;
2181   if (id)
2182     *id = charset->id;
2183   return (mseq_found ? -c : c);
2184
2185  no_more_source:
2186   return -2;
2187
2188  invalid_code:
2189   return -1;
2190 }
2191
2192
2193 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2194
2195 /* Handle these composition sequences ('|': the end of header elements,
2196    BYTES and CHARS >= 0xA0):
2197
2198    (1) relative composition: 0x80 0xF2 BYTES CHARS | CHAR ...
2199    (2) altchar composition:  0x80 0xF4 BYTES CHARS | ALT ... ALT CHAR ...
2200    (3) alt&rule composition: 0x80 0xF5 BYTES CHARS | ALT RULE ... ALT CHAR ...
2201
2202    and these old form:
2203
2204    (4) relative composition: 0x80 | MSEQ ... MSEQ
2205    (5) rulebase composition: 0x80 0xFF | MSEQ MRULE ... MSEQ
2206
2207    When the starter 0x80 and the following header elements are found,
2208    this annotation header is produced.
2209
2210         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS NBYTES METHOD ]
2211
2212    NCHARS is CHARS - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2213    NBYTES is BYTES - 0xA0 for (1), (2), (3), and 0 for (4), (5).
2214
2215    Then, upon reading the following elements, these codes are produced
2216    until the composition end is found:
2217
2218    (1) CHAR ... CHAR
2219    (2) ALT ... ALT CHAR ... CHAR
2220    (3) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT CHAR ... CHAR
2221    (4) CHAR ... CHAR
2222    (5) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
2223
2224    When the composition end is found, LENGTH and NCHARS in the
2225    annotation header are updated as below:
2226
2227    (1) LENGTH: unchanged, NCHARS: unchanged
2228    (2) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2229    (3) LENGTH: length of the whole sequence minus NCHARS, NCHARS: unchanged
2230    (4) LENGTH: unchanged,  NCHARS: number of CHARs
2231    (5) LENGTH: unchanged,  NCHARS: number of CHARs
2232
2233    If an error is found while composing, the annotation header is
2234    changed to the original composition header (plus filler -1s) as
2235    below:
2236
2237    (1),(2),(3)  [ 0x80 0xF2+METHOD BYTES CHARS -1 ]
2238    (5)          [ 0x80 0xFF -1 -1 -1 ]
2239
2240    and the sequence [ -2 DECODED-RULE ] is changed to the original
2241    byte sequence as below:
2242         o the original byte sequence is B: [ B -1 ]
2243         o the original byte sequence is B1 B2: [ B1 B2 ]
2244
2245    Most of the routines are implemented by macros because many
2246    variables and labels in the caller decode_coding_emacs_mule must be
2247    accessible, and they are usually called just once (thus doesn't
2248    increase the size of compiled object).  */
2249
2250 /* Decode a composition rule represented by C as a component of
2251    composition sequence of Emacs 20 style.  Set RULE to the decoded
2252    rule.
     
        C is modified in place (0xA0 is subtracted) and must then lie in
        0..80; otherwise control jumps to the caller's `invalid_code'
        label.  The two reference points are C / 9 and C % 9, with the
        value 4 replaced by 10 in each, and are combined with
        COMPOSITION_ENCODE_RULE.  */
2253
2254 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(c, rule)  \
2255   do {                                                  \
2256     int gref, nref;                                     \
2257                                                         \
2258     c -= 0xA0;                                          \
2259     if (c < 0 || c >= 81)                               \
2260       goto invalid_code;                                \
2261     gref = c / 9, nref = c % 9;                         \
2262     if (gref == 4) gref = 10;                           \
2263     if (nref == 4) nref = 10;                           \
2264     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2265   } while (0)
2266
2267
2268 /* Decode a composition rule represented by C and the following byte
2269    at SRC as a component of composition sequence of Emacs 21 style.
2270    Set RULE to the decoded rule.
     
        C supplies the first reference point and the byte read next with
        ONE_MORE_BYTE (which overwrites C) supplies the second; each has
        0x20 subtracted and must then lie in 0..80, otherwise control
        jumps to the caller's `invalid_code' label.  The two values are
        combined with COMPOSITION_ENCODE_RULE.  */
2271
2272 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(c, rule)  \
2273   do {                                                  \
2274     int gref, nref;                                     \
2275                                                         \
2276     gref = c - 0x20;                                    \
2277     if (gref < 0 || gref >= 81)                         \
2278       goto invalid_code;                                \
2279     ONE_MORE_BYTE (c);                                  \
2280     nref = c - 0x20;                                    \
2281     if (nref < 0 || nref >= 81)                         \
2282       goto invalid_code;                                \
2283     rule = COMPOSITION_ENCODE_RULE (gref, nref);        \
2284   } while (0)
2285
2286
2287 /* Start of Emacs 21 style format.  On entry C holds the method byte
2288    (METHOD + 0xF2); the next two bytes at SRC are (BYTES + 0xA0) and
2289    (CHARS + 0xA0), where BYTES is the byte length of this composition
2290    information and CHARS is the number of characters composed by it.
     
        Control jumps to the caller's `invalid_code' label when the BYTES
        byte reads as negative, when BYTES is below 3 (or differs from 4
        for COMPOSITION_RELATIVE), or when CHARS is not in
        1..MAX_COMPOSITION_COMPONENTS-1.  Otherwise CMP_STATUS is set up
        for a new-form composition (state COMPOSING_CHAR for the relative
        method, COMPOSING_COMPONENT_CHAR for the others; ncomps is
        BYTES - 4) and ADD_COMPOSITION_DATA is invoked on CHARBUF.
     
        The macro uses the caller's c, charbuf, cmp_status and whatever
        ONE_MORE_BYTE needs.  */
2291
2292 #define DECODE_EMACS_MULE_21_COMPOSITION()                              \
2293   do {                                                                  \
2294     enum composition_method method = c - 0xF2;                          \
2295     int *charbuf_base = charbuf;                                        \
2296     int nbytes, nchars;                                                 \
2297                                                                         \
2298     ONE_MORE_BYTE (c);                                                  \
2299     if (c < 0)                                                          \
2300       goto invalid_code;                                                \
2301     nbytes = c - 0xA0;                                                  \
2302     if (nbytes < 3 || (method == COMPOSITION_RELATIVE && nbytes != 4))  \
2303       goto invalid_code;                                                \
2304     ONE_MORE_BYTE (c);                                                  \
2305     nchars = c - 0xA0;                                                  \
2306     if (nchars <= 0 || nchars >= MAX_COMPOSITION_COMPONENTS)            \
2307       goto invalid_code;                                                \
2308     cmp_status->old_form = 0;                                           \
2309     cmp_status->method = method;                                        \
2310     if (method == COMPOSITION_RELATIVE)                                 \
2311       cmp_status->state = COMPOSING_CHAR;                               \
2312     else                                                                \
2313       cmp_status->state = COMPOSING_COMPONENT_CHAR;                     \
2314     cmp_status->length = MAX_ANNOTATION_LENGTH;                         \
2315     cmp_status->nchars = nchars;                                        \
2316     cmp_status->ncomps = nbytes - 4;                                    \
2317     ADD_COMPOSITION_DATA (charbuf, nchars, nbytes, method);             \
2318   } while (0)
2319
2320
2321 /* Start of Emacs 20 style format for relative composition.
     
        Marks CMP_STATUS as an old-form composition using
        COMPOSITION_RELATIVE in state COMPOSING_CHAR, with the character
        and component counts reset to 0, and invokes ADD_COMPOSITION_DATA
        on CHARBUF with zero counts.  Uses the caller's charbuf and
        cmp_status.  */
2322
2323 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION()             \
2324   do {                                                          \
2325     cmp_status->old_form = 1;                                   \
2326     cmp_status->method = COMPOSITION_RELATIVE;                  \
2327     cmp_status->state = COMPOSING_CHAR;                         \
2328     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2329     cmp_status->nchars = cmp_status->ncomps = 0;                \
2330     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2331   } while (0)
2332
2333
2334 /* Start of Emacs 20 style format for rule-base composition.
     
        Same setup as the relative-composition variant, except that the
        method recorded in CMP_STATUS is COMPOSITION_WITH_RULE.  Uses the
        caller's charbuf and cmp_status.  */
2335
2336 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION()             \
2337   do {                                                          \
2338     cmp_status->old_form = 1;                                   \
2339     cmp_status->method = COMPOSITION_WITH_RULE;                 \
2340     cmp_status->state = COMPOSING_CHAR;                         \
2341     cmp_status->length = MAX_ANNOTATION_LENGTH;                 \
2342     cmp_status->nchars = cmp_status->ncomps = 0;                \
2343     ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);   \
2344   } while (0)
2345
2346
     /* Read the byte that follows a composition starter and begin the
        matching kind of composition:
          - a byte whose value minus 0xF2 is a composition method in
            COMPOSITION_RELATIVE..COMPOSITION_WITH_RULE_ALTCHARS starts an
            Emacs 21 style composition;
          - a byte in 0xA0..0xBF starts an Emacs 20 style relative
            composition, and SRC is moved back so that the byte is read
            again as the first component;
          - 0xFF starts an Emacs 20 style rule-base composition.
        A negative byte, or any other value, jumps to the caller's
        `invalid_code' label.  Uses the caller's c and src.  */
2347 #define DECODE_EMACS_MULE_COMPOSITION_START()           \
2348   do {                                                  \
2349     const unsigned char *current_src = src;             \
2350                                                         \
2351     ONE_MORE_BYTE (c);                                  \
2352     if (c < 0)                                          \
2353       goto invalid_code;                                \
2354     if (c - 0xF2 >= COMPOSITION_RELATIVE                \
2355         && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)  \
2356       DECODE_EMACS_MULE_21_COMPOSITION ();              \
2357     else if (c < 0xA0)                                  \
2358       goto invalid_code;                                \
2359     else if (c < 0xC0)                                  \
2360       {                                                 \
2361         DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION ();   \
2362         /* Re-read C as a composition component.  */    \
2363         src = current_src;                              \
2364       }                                                 \
2365     else if (c == 0xFF)                                 \
2366       DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION ();     \
2367     else                                                \
2368       goto invalid_code;                                \
2369   } while (0)
2370
/* Close the composition being decoded.  The composition annotation
   starts `cmp_status->length' elements before `charbuf'.  For the old
   form, store the number of characters collected so far in the
   annotation's third element.  Otherwise, when the method is greater
   than COMPOSITION_RELATIVE, set the annotation's first element to
   its third element minus the accumulated length.  In every case
   leave the composing state.  Uses `charbuf' and `cmp_status' of the
   expansion site.  */

#define EMACS_MULE_COMPOSITION_END()                            \
  do {                                                          \
    int *annotation = charbuf - cmp_status->length;             \
                                                                \
    if (cmp_status->old_form)                                   \
      annotation[2] = cmp_status->nchars;                       \
    else if (cmp_status->method > COMPOSITION_RELATIVE)         \
      annotation[0] = annotation[2] - cmp_status->length;       \
    cmp_status->state = COMPOSING_NO;                           \
  } while (0)
2381
2382
2383 static int
2384 emacs_mule_finish_composition (charbuf, cmp_status)
2385      int *charbuf;
2386      struct composition_status *cmp_status;
2387 {
2388   int idx = - cmp_status->length;
2389   int new_chars;
2390
2391   if (cmp_status->old_form && cmp_status->nchars > 0)
2392     {
2393       charbuf[idx + 2] = cmp_status->nchars;
2394       new_chars = 0;
2395       if (cmp_status->method == COMPOSITION_WITH_RULE
2396           && cmp_status->state == COMPOSING_CHAR)
2397         {
2398           /* The last rule was invalid.  */
2399           int rule = charbuf[-1] + 0xA0;
2400
2401           charbuf[-2] = BYTE8_TO_CHAR (rule);
2402           charbuf[-1] = -1;
2403           new_chars = 1;
2404         }
2405     }
2406   else
2407     {
2408       charbuf[idx++] = BYTE8_TO_CHAR (0x80);
2409
2410       if (cmp_status->method == COMPOSITION_WITH_RULE)
2411         {
2412           charbuf[idx++] = BYTE8_TO_CHAR (0xFF);
2413           charbuf[idx++] = -3;
2414           charbuf[idx++] = 0;
2415           new_chars = 1;
2416         }
2417       else
2418         {
2419           int nchars = charbuf[idx + 1] + 0xA0;
2420           int nbytes = charbuf[idx + 2] + 0xA0;
2421
2422           charbuf[idx++] = BYTE8_TO_CHAR (0xF2 + cmp_status->method);
2423           charbuf[idx++] = BYTE8_TO_CHAR (nbytes);
2424           charbuf[idx++] = BYTE8_TO_CHAR (nchars);
2425           charbuf[idx++] = -1;
2426           new_chars = 4;
2427         }
2428     }
2429   cmp_status->state = COMPOSING_NO;
2430   return new_chars;
2431 }
2432
/* If a composition is in progress, finish it with
   emacs_mule_finish_composition and advance `char_offset' by the
   number of characters that call reports.  Uses `charbuf',
   `cmp_status' and `char_offset' of the expansion site.  */

#define EMACS_MULE_MAYBE_FINISH_COMPOSITION()                      \
  do {                                                             \
    if (cmp_status->state != COMPOSING_NO)                         \
      {                                                            \
        int extra_chars                                            \
          = emacs_mule_finish_composition (charbuf, cmp_status);   \
                                                                   \
        char_offset += extra_chars;                                \
      }                                                            \
  } while (0)
2438
2439
/* Decode the emacs-mule encoded source of CODING, starting at
   coding->source + coding->consumed, into coding->charbuf after the
   coding->charbuf_used elements already there.

   Byte 0x80 introduces a composition; its progress is kept in
   coding->spec.emacs_mule.cmp_status.  A composition still open when
   the function stops is saved in cmp_status->carryover, unless
   CODING_MODE_LAST_BLOCK is set, in which case it is finished
   immediately.  A byte sequence that cannot be decoded is stored one
   byte at a time (ASCII as is, others as BYTE8 characters) and
   counted in coding->errors.

   On return coding->consumed, coding->consumed_char and
   coding->charbuf_used are updated.

   NOTE(review): `src_end', `multibytep' and the labels `retry' and
   `no_more_source' are not referenced directly in this function;
   presumably they are used by ONE_MORE_BYTE or other macros defined
   elsewhere -- confirm before removing any of them.  */
2440 static void
2441 decode_coding_emacs_mule (coding)
2442      struct coding_system *coding;
2443 {
2444   const unsigned char *src = coding->source + coding->consumed;
2445   const unsigned char *src_end = coding->source + coding->src_bytes;
2446   const unsigned char *src_base;
2447   int *charbuf = coding->charbuf + coding->charbuf_used;
2448   /* We may produce two annotations (charset and composition) in one
2449      loop and one more charset annotation at the end.  */
2450   int *charbuf_end
2451     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
2452   int consumed_chars = 0, consumed_chars_base;
2453   int multibytep = coding->src_multibyte;
2454   Lisp_Object attrs, charset_list;
       /* CHAR_OFFSET counts the characters produced so far.  LAST_ID is
          the charset of the current run of characters and LAST_OFFSET
          the value of CHAR_OFFSET where that run began.  */
2455   int char_offset = coding->produced_char;
2456   int last_offset = char_offset;
2457   int last_id = charset_ascii;
2458   int eol_crlf =
2459     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* A byte already read after a CR, or -1 if there is none.  */
2460   int byte_after_cr = -1;
2461   struct composition_status *cmp_status = &coding->spec.emacs_mule.cmp_status;
2462
2463   CODING_GET_INFO (coding, attrs, charset_list);
2464
       /* Resume a composition left open by the previous call: put the
          carried-over elements back at the current end of CHARBUF.  */
2465   if (cmp_status->state != COMPOSING_NO)
2466     {
2467       int i;
2468
2469       for (i = 0; i < cmp_status->length; i++)
2470         *charbuf++ = cmp_status->carryover[i];
2471       coding->annotated = 1;
2472     }
2473
2474   while (1)
2475     {
2476       int c, id;
2477
           /* Remember where this iteration started; these values are
              what is reported as consumed when the loop is left.  */
2478       src_base = src;
2479       consumed_chars_base = consumed_chars;
2480
2481       if (charbuf >= charbuf_end)
2482         {
               /* Out of room.  A byte pre-read after CR has not been
                  processed yet, so do not count it as consumed.  */
2483           if (byte_after_cr >= 0)
2484             src_base--;
2485           break;
2486         }
2487
2488       if (byte_after_cr >= 0)
2489         c = byte_after_cr, byte_after_cr = -1;
2490       else
2491         ONE_MORE_BYTE (c);
2492
2493       if (c < 0 || c == 0x80)
2494         {
               /* Either kind of input ends a composition in progress.
                  A negative C is stored as the character -C
                  (NOTE(review): presumably a raw byte reported by
                  ONE_MORE_BYTE for multibyte source -- confirm); 0x80
                  starts a new composition.  */
2495           EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2496           if (c < 0)
2497             {
2498               *charbuf++ = -c;
2499               char_offset++;
2500             }
2501           else
2502             DECODE_EMACS_MULE_COMPOSITION_START ();
2503           continue;
2504         }
2505
2506       if (c < 0x80)
2507         {
               /* With DOS end-of-line handling, read the byte after a
                  CR right away; it is processed on the next
                  iteration.  */
2508           if (eol_crlf && c == '\r')
2509             ONE_MORE_BYTE (byte_after_cr);
2510           id = charset_ascii;
2511           if (cmp_status->state != COMPOSING_NO)
2512             {
                   /* An ASCII byte finishes an old-form composition;
                      in the new form it uses up one component while
                      components are being read.  */
2513               if (cmp_status->old_form)
2514                 EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2515               else if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2516                 cmp_status->ncomps--;
2517             }
2518         }
2519       else
2520         {
2521           int nchars, nbytes;
2522
               /* Decode one multi-byte sequence starting at SRC_BASE.
                  A result of -1 is treated as invalid input and -2
                  stops decoding (NOTE(review): presumably because the
                  sequence is incomplete -- confirm against
                  emacs_mule_char).  Other negative values continue
                  below.  */
2523           c = emacs_mule_char (coding, src_base, &nbytes, &nchars, &id,
2524                                cmp_status);
2525           if (c < 0)
2526             {
2527               if (c == -1)
2528                 goto invalid_code;
2529               if (c == -2)
2530                 break;
2531             }
2532           src = src_base + nbytes;
2533           consumed_chars = consumed_chars_base + nchars;
2534           if (cmp_status->state >= COMPOSING_COMPONENT_CHAR)
2535             cmp_status->ncomps -= nchars;
2536         }
2537
2538       /* Now if C >= 0, we found a normally encoded character, if C <
2539          0, we found an old-style composition component character or
2540          rule.  */
2541
2542       if (cmp_status->state == COMPOSING_NO)
2543         {
               /* Ordinary text.  When the charset changes, close the
                  previous run with a charset annotation unless that
                  run was ASCII.  */
2544           if (last_id != id)
2545             {
2546               if (last_id != charset_ascii)
2547                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset,
2548                                   last_id);
2549               last_id = id;
2550               last_offset = char_offset;
2551             }
2552           *charbuf++ = c;
2553           char_offset++;
2554         }
2555       else if (cmp_status->state == COMPOSING_CHAR)
2556         {
2557           if (cmp_status->old_form)
2558             {
2559               if (c >= 0)
2560                 {
                       /* A normally encoded character ends the
                          old-form composition and is stored as
                          ordinary text.  */
2561                   EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2562                   *charbuf++ = c;
2563                   char_offset++;
2564                 }
2565               else
2566                 {
                       /* A component character: store it and close
                          the composition once the maximum number of
                          components is reached.  A rule-base
                          composition expects a rule next.  */
2567                   *charbuf++ = -c;
2568                   cmp_status->nchars++;
2569                   cmp_status->length++;
2570                   if (cmp_status->nchars == MAX_COMPOSITION_COMPONENTS)
2571                     EMACS_MULE_COMPOSITION_END ();
2572                   else if (cmp_status->method == COMPOSITION_WITH_RULE)
2573                     cmp_status->state = COMPOSING_RULE;
2574                 }
2575             }
2576           else
2577             {
                   /* New form: NCHARS counts down the characters that
                      remain to be read.  */
2578               *charbuf++ = c;
2579               cmp_status->length++;
2580               cmp_status->nchars--;
2581               if (cmp_status->nchars == 0)
2582                 EMACS_MULE_COMPOSITION_END ();
2583             }
2584         }
2585       else if (cmp_status->state == COMPOSING_RULE)
2586         {
2587           int rule;
2588
2589           if (c >= 0)
2590             {
                   /* A normal character where a rule was expected
                      ends the composition.  */
2591               EMACS_MULE_COMPOSITION_END ();
2592               *charbuf++ = c;
2593               char_offset++;
2594             }
2595           else
2596             {
                   /* Store the decoded rule as the pair (-2, RULE) and
                      expect a character next.  */
2597               c = -c;
2598               DECODE_EMACS_MULE_COMPOSITION_RULE_20 (c, rule);
2599               if (rule < 0)
2600                 goto invalid_code;
2601               *charbuf++ = -2;
2602               *charbuf++ = rule;
2603               cmp_status->length += 2;
2604               cmp_status->state = COMPOSING_CHAR;
2605             }
2606         }
2607       else if (cmp_status->state == COMPOSING_COMPONENT_CHAR)
2608         {
               /* NCOMPS was already decremented for this character
                  above.  Zero means all components have been read; a
                  negative count means the data is inconsistent, so
                  the composition is finished early.  */
2609           *charbuf++ = c;
2610           cmp_status->length++;
2611           if (cmp_status->ncomps == 0)
2612             cmp_status->state = COMPOSING_CHAR;
2613           else if (cmp_status->ncomps > 0)
2614             {
2615               if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)
2616                 cmp_status->state = COMPOSING_COMPONENT_RULE;
2617             }
2618           else
2619             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2620         }
2621       else                      /* COMPOSING_COMPONENT_RULE */
2622         {
2623           int rule;
2624
               /* Store the decoded rule as the pair (-2, RULE); a
                  component character must follow, otherwise the
                  composition is finished early.  */
2625           DECODE_EMACS_MULE_COMPOSITION_RULE_21 (c, rule);
2626           if (rule < 0)
2627             goto invalid_code;
2628           *charbuf++ = -2;
2629           *charbuf++ = rule;
2630           cmp_status->length += 2;
2631           cmp_status->ncomps--;
2632           if (cmp_status->ncomps > 0)
2633             cmp_status->state = COMPOSING_COMPONENT_CHAR;
2634           else
2635             EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2636         }
2637       continue;
2638
           /* Start this iteration over from its first byte.  */
2639     retry:
2640       src = src_base;
2641       consumed_chars = consumed_chars_base;
2642       continue;
2643
           /* Finish any composition in progress, then go back to the
              start of the offending sequence and store only its first
              byte as is; the following bytes are decoded again from
              scratch.  */
2644     invalid_code:
2645       EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2646       src = src_base;
2647       consumed_chars = consumed_chars_base;
2648       ONE_MORE_BYTE (c);
2649       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2650       char_offset++;
2651       coding->errors++;
2652     }
2653
2654  no_more_source:
2655   if (cmp_status->state != COMPOSING_NO)
2656     {
           /* At the end of the input finish the composition now;
              otherwise take its elements back out of CHARBUF and keep
              them in the carryover buffer for the next call.  */
2657       if (coding->mode & CODING_MODE_LAST_BLOCK)
2658         EMACS_MULE_MAYBE_FINISH_COMPOSITION ();
2659       else
2660         {
2661           int i;
2662
2663           charbuf -= cmp_status->length;
2664           for (i = 0; i < cmp_status->length; i++)
2665             cmp_status->carryover[i] = charbuf[i];
2666         }
2667     }
       /* Close the last run of non-ASCII characters.  */
2668   if (last_id != charset_ascii)
2669     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
       /* Report only what was consumed up to the start of the last,
          unfinished iteration.  */
2670   coding->consumed_char += consumed_chars_base;
2671   coding->consumed = src_base - coding->source;
2672   coding->charbuf_used = charbuf - coding->charbuf;
2673 }
2674
2675
/* Store in CODES (an array of at least two bytes) the leading code(s)
   emitted before the position codes of a character whose charset has
   the emacs-mule id ID.

   An id below 0xA0 is itself the only leading code, and CODES[1] is
   set to 0.  A larger id is stored in CODES[1] and preceded by 0x9A
   (id < 0xE0), 0x9B (id < 0xF0), 0x9C (id < 0xF5) or 0x9D (all other
   ids).

   ID is evaluated more than once.  The expansion is one statement
   without a trailing semicolon, so the caller writes the semicolon
   and the macro can be used as the body of an unbraced `if'.  */

#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2689
2690
/* Encode the characters and annotations in coding->charbuf (the first
   coding->charbuf_used elements) into emacs-mule bytes stored from
   coding->destination + coding->produced on.

   ASCII characters and BYTE8 characters are emitted as one byte.  Any
   other character is emitted as the leading code(s) of its charset
   followed by one or two position codes with the high bit set.  A
   charset annotation selects the charset to try first for the
   characters after it; composition annotations are skipped.

   As far as the code visible here shows, the function records
   CODING_RESULT_SUCCESS, updates coding->produced_char and
   coding->produced, and returns 0.  NOTE(review): ASSURE_DESTINATION
   and the EMIT_* macros are defined elsewhere and may leave the
   function early; `multibytep', `dst_end' and `produced_chars' are
   presumably used by them -- confirm.  */
2691 static int
2692 encode_coding_emacs_mule (coding)
2693      struct coding_system *coding;
2694 {
2695   int multibytep = coding->dst_multibyte;
2696   int *charbuf = coding->charbuf;
2697   int *charbuf_end = charbuf + coding->charbuf_used;
2698   unsigned char *dst = coding->destination + coding->produced;
2699   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2700   int safe_room = 8;
2701   int produced_chars = 0;
2702   Lisp_Object attrs, charset_list;
2703   int c;
       /* Charset requested by the latest charset annotation, or -1.  */
2704   int preferred_charset_id = -1;
2705
2706   CODING_GET_INFO (coding, attrs, charset_list);
       /* Always encode with Vemacs_mule_charset_list, and store it in
          the coding system's attributes if it is not there yet.  */
2707   if (! EQ (charset_list, Vemacs_mule_charset_list))
2708     {
2709       CODING_ATTR_CHARSET_LIST (attrs)
2710         = charset_list = Vemacs_mule_charset_list;
2711     }
2712
2713   while (charbuf < charbuf_end)
2714     {
2715       ASSURE_DESTINATION (safe_room);
2716       c = *charbuf++;
2717
2718       if (c < 0)
2719         {
2720           /* Handle an annotation.  */
               /* C is the negated number of elements of the whole
                  annotation, and CHARBUF now points at its second
                  element, which tells the annotation type.  */
2721           switch (*charbuf)
2722             {
2723             case CODING_ANNOTATE_COMPOSITION_MASK:
2724               /* Not yet implemented.  */
2725               break;
2726             case CODING_ANNOTATE_CHARSET_MASK:
                   /* NOTE(review): index 3 is counted from the
                      annotation's second element; confirm that it
                      matches the layout written by ADD_CHARSET_DATA.
                      A charset that is not in CHARSET_LIST is
                      ignored.  */
2727               preferred_charset_id = charbuf[3];
2728               if (preferred_charset_id >= 0
2729                   && NILP (Fmemq (make_number (preferred_charset_id),
2730                                   charset_list)))
2731                 preferred_charset_id = -1;
2732               break;
2733             default:
2734               abort ();
2735             }
               /* Skip the rest of the annotation.  */
2736           charbuf += -c - 1;
2737           continue;
2738         }
2739
2740       if (ASCII_CHAR_P (c))
2741         EMIT_ONE_ASCII_BYTE (c);
2742       else if (CHAR_BYTE8_P (c))
2743         {
2744           c = CHAR_TO_BYTE8 (c);
2745           EMIT_ONE_BYTE (c);
2746         }
2747       else
2748         {
2749           struct charset *charset;
2750           unsigned code;
2751           int dimension;
2752           int emacs_mule_id;
2753           unsigned char leading_codes[2];
2754
               /* Use the preferred charset if it contains C, otherwise
                  look for a charset in CHARSET_LIST.  */
2755           if (preferred_charset_id >= 0)
2756             {
2757               charset = CHARSET_FROM_ID (preferred_charset_id);
2758               if (CHAR_CHARSET_P (c, charset))
2759                 code = ENCODE_CHAR (charset, c);
2760               else
2761                 charset = char_charset (c, charset_list, &code);
2762             }
2763           else
2764             charset = char_charset (c, charset_list, &code);
2765           if (! charset)
2766             {
                   /* No charset can encode C: encode the coding
                      system's default character instead.
                      NOTE(review): CHARSET is not checked again after
                      the lookup below and is dereferenced right after
                      it -- confirm that the default character always
                      has a charset in CHARSET_LIST.  */
2767               c = coding->default_char;
2768               if (ASCII_CHAR_P (c))
2769                 {
2770                   EMIT_ONE_ASCII_BYTE (c);
2771                   continue;
2772                 }
2773               charset = char_charset (c, charset_list, &code);
2774             }
2775           dimension = CHARSET_DIMENSION (charset);
2776           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2777           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2778           EMIT_ONE_BYTE (leading_codes[0]);
               /* The second leading code is 0 when there is none.  */
2779           if (leading_codes[1])
2780             EMIT_ONE_BYTE (leading_codes[1]);
               /* Position codes are emitted with the high bit set.  */
2781           if (dimension == 1)
2782             EMIT_ONE_BYTE (code | 0x80);
2783           else
2784             {
2785               code |= 0x8080;
2786               EMIT_ONE_BYTE (code >> 8);
2787               EMIT_ONE_BYTE (code & 0xFF);
2788             }
2789         }
2790     }
2791   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2792   coding->produced_char += produced_chars;
2793   coding->produced = dst - coding->destination;
2794   return 0;
2795 }
2796
2797 \f
2798 /*** 7. ISO2022 handlers ***/
2799
2800 /* The following note describes the coding system ISO2022 briefly.
2801    Since the intention of this note is to help understand the
2802    functions in this file, some parts are NOT ACCURATE or are OVERLY
2803    SIMPLIFIED.  For thorough understanding, please refer to the
2804    original document of ISO2022.  This is equivalent to the standard
2805    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2806
2807    ISO2022 provides many mechanisms to encode several character sets
2808    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2809    is encoded using bytes less than 128.  This may make the encoded
2810    text a little bit longer, but the text passes more easily through
2811    several types of gateway, some of which strip off the MSB (Most
2812    Significant Bit).
2813
2814    There are two kinds of character sets: control character sets and
2815    graphic character sets.  The former contain control characters such
2816    as `newline' and `escape' to provide control functions (control
2817    functions are also provided by escape sequences).  The latter
2818    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2819    two control character sets and many graphic character sets.
2820
2821    Graphic character sets are classified into one of the following
2822    four classes, according to the number of bytes (DIMENSION) and
2823    number of characters in one dimension (CHARS) of the set:
2824    - DIMENSION1_CHARS94
2825    - DIMENSION1_CHARS96
2826    - DIMENSION2_CHARS94
2827    - DIMENSION2_CHARS96
2828
2829    In addition, each character set is assigned an identification tag,
2830    unique for each set, called the "final character" (denoted as <F>
2831    hereafter).  The <F> of each character set is decided by ECMA(*)
2832    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2833    (0x30..0x3F are for private use only).
2834
2835    Note (*): ECMA = European Computer Manufacturers Association
2836
2837    Here are examples of graphic character sets [NAME(<F>)]:
2838         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2839         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2840         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2841         o DIMENSION2_CHARS96 -- none for the moment
2842
2843    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2844         C0 [0x00..0x1F] -- control character plane 0
2845         GL [0x20..0x7F] -- graphic character plane 0
2846         C1 [0x80..0x9F] -- control character plane 1
2847         GR [0xA0..0xFF] -- graphic character plane 1
2848
2849    A control character set is directly designated and invoked to C0 or
2850    C1 by an escape sequence.  The most common case is that:
2851    - ISO646's  control character set is designated/invoked to C0, and
2852    - ISO6429's control character set is designated/invoked to C1,
2853    and usually these designations/invocations are omitted in encoded
2854    text.  In a 7-bit environment, only C0 can be used, and a control
2855    character for C1 is encoded by an appropriate escape sequence to
2856    fit into the environment.  All control characters for C1 are
2857    defined to have corresponding escape sequences.
2858
2859    A graphic character set is at first designated to one of four
2860    graphic registers (G0 through G3), then these graphic registers are
2861    invoked to GL or GR.  These designations and invocations can be
2862    done independently.  The most common case is that G0 is invoked to
2863    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2864    these invocations and designations are omitted in encoded text.
2865    In a 7-bit environment, only GL can be used.
2866
2867    When a graphic character set of CHARS94 is invoked to GL, codes
2868    0x20 and 0x7F of the GL area work as control characters SPACE and
2869    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2870    be used.
2871
2872    There are two ways of invocation: locking-shift and single-shift.
2873    With locking-shift, the invocation lasts until the next different
2874    invocation, whereas with single-shift, the invocation affects the
2875    following character only and doesn't affect the locking-shift
2876    state.  Invocations are done by the following control characters or
2877    escape sequences:
2878
2879    ----------------------------------------------------------------------
2880    abbrev  function                  cntrl escape seq   description
2881    ----------------------------------------------------------------------
2882    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2883    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2884    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2885    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2886    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2887    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2888    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2889    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2890    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2891    ----------------------------------------------------------------------
2892    (*) These are not used by any known coding system.
2893
2894    Control characters for these functions are defined by macros
2895    ISO_CODE_XXX in `coding.h'.
2896
2897    Designations are done by the following escape sequences:
2898    ----------------------------------------------------------------------
2899    escape sequence      description
2900    ----------------------------------------------------------------------
2901    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2902    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2903    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2904    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2905    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2906    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2907    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2908    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2909    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2910    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2911    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2912    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2913    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2914    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2915    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2916    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2917    ----------------------------------------------------------------------
2918
2919    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2920    of dimension 1, chars 94, and final character <F>, etc...
2921
2922    Note (*): Although these designations are not allowed in ISO2022,
2923    Emacs accepts them on decoding, and produces them on encoding
2924    CHARS96 character sets in a coding system which is characterized as
2925    7-bit environment, non-locking-shift, and non-single-shift.
2926
2927    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2928    '(' must be omitted.  We refer to this as "short-form" hereafter.
2929
2930    Now you may notice that there are a lot of ways of encoding the
2931    same multilingual text in ISO2022.  Actually, there exist many
2932    coding systems such as Compound Text (used in X11's inter-client
2933    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2934    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2935    localized platforms), and all of these are variants of ISO2022.
2936
2937    In addition to the above, Emacs handles two more kinds of escape
2938    sequences: ISO6429's direction specification and Emacs' private
2939    sequence for specifying character composition.
2940
2941    ISO6429's direction specification takes the following form:
2942         o CSI ']'      -- end of the current direction
2943         o CSI '0' ']'  -- end of the current direction
2944         o CSI '1' ']'  -- start of left-to-right text
2945         o CSI '2' ']'  -- start of right-to-left text
2946    The control character CSI (0x9B: control sequence introducer) is
2947    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2948
2949    Character composition specification takes the following form:
2950         o ESC '0' -- start relative composition
2951         o ESC '1' -- end composition
2952         o ESC '2' -- start rule-base composition (*)
2953         o ESC '3' -- start relative composition with alternate chars  (**)
2954         o ESC '4' -- start rule-base composition with alternate chars  (**)
2955   Since these are not standard escape sequences of any ISO standard,
2956   the use of them with these meanings is restricted to Emacs only.
2957
2958   (*) This form is used only in Emacs 20.7 and older versions,
2959   but newer versions can safely decode it.
2960   (**) This form is used only in Emacs 21.1 and newer versions,
2961   and older versions can't decode it.
2962
2963   Here's a list of example usages of these composition escape
2964   sequences (categorized by `enum composition_method').
2965
2966   COMPOSITION_RELATIVE:
2967         ESC 0 CHAR [ CHAR ] ESC 1
2968   COMPOSITION_WITH_RULE:
2969         ESC 2 CHAR [ RULE CHAR ] ESC 1
2970   COMPOSITION_WITH_ALTCHARS:
2971         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2972   COMPOSITION_WITH_RULE_ALTCHARS:
2973         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2974
/* Table of 256 `enum iso_code_class_type' values.  NOTE(review):
   presumably indexed by byte value to give the ISO 2022 class of each
   byte; it is neither filled in nor read in this part of the file --
   confirm where it is initialized.  */
2975 enum iso_code_class_type iso_code_class[256];
2976
/* Nonzero if the charset with id ID is safe for CODING, i.e. ID lies
   within 0..CODING->max_charset_id and its entry in
   CODING->safe_charsets is not 255.

   A negative ID is rejected before the table is accessed: charset
   lookups report "no such charset" with a negative id, and such a
   value would otherwise index memory in front of the table.
   ID is evaluated more than once.  */

#define SAFE_CHARSET_P(coding, id)      \
  ((id) >= 0                            \
   && (id) <= (coding)->max_charset_id  \
   && (coding)->safe_charsets[(id)] != 255)
2980
2981
/* Nonzero if CODING_ISO_INITIAL yields a non-negative value for
   index 1 (presumably graphic register G1) of the coding system
   stored for CATEGORY in coding_categories.  */
#define SHIFT_OUT_OK(category)  \
  (0 <= CODING_ISO_INITIAL (&coding_categories[category], 1))
2984
2985 static void
2986 setup_iso_safe_charsets (attrs)
2987      Lisp_Object attrs;
2988 {
2989   Lisp_Object charset_list, safe_charsets;
2990   Lisp_Object request;
2991   Lisp_Object reg_usage;
2992   Lisp_Object tail;
2993   int reg94, reg96;
2994   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2995   int max_charset_id;
2996
2997   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2998   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2999       && ! EQ (charset_list, Viso_2022_charset_list))
3000     {
3001       CODING_ATTR_CHARSET_LIST (attrs)
3002         = charset_list = Viso_2022_charset_list;
3003       ASET (attrs, coding_attr_safe_charsets, Qnil);
3004     }
3005
3006   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
3007     return;
3008
3009   max_charset_id = 0;
3010   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3011     {
3012       int id = XINT (XCAR (tail));
3013       if (max_charset_id < id)
3014         max_charset_id = id;
3015     }
3016
3017   safe_charsets = make_uninit_string (max_charset_id + 1);
3018   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
3019   request = AREF (attrs, coding_attr_iso_request);
3020   reg_usage = AREF (attrs, coding_attr_iso_usage);
3021   reg94 = XINT (XCAR (reg_usage));
3022   reg96 = XINT (XCDR (reg_usage));
3023
3024   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
3025     {
3026       Lisp_Object id;
3027       Lisp_Object reg;
3028       struct charset *charset;
3029
3030       id = XCAR (tail);
3031       charset = CHARSET_FROM_ID (XINT (id));
3032       reg = Fcdr (Fassq (id, request));
3033       if (! NILP (reg))
3034         SSET (safe_charsets, XINT (id), XINT (reg));
3035       else if (charset->iso_chars_96)
3036         {
3037           if (reg96 < 4)
3038             SSET (safe_charsets, XINT (id), reg96);
3039         }
3040       else
3041         {
3042           if (reg94 < 4)
3043             SSET (safe_charsets, XINT (id), reg94);
3044         }
3045     }
3046   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
3047 }
3048
3049
3050 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3051    Check if a text is encoded in one of ISO-2022 based coding systems.
3052    If it is, return 1, else return 0.  */
3053
3054 static int
3055 detect_coding_iso_2022 (coding, detect_info)
3056      struct coding_system *coding;
3057      struct coding_detection_info *detect_info;
3058 {
3059   const unsigned char *src = coding->source, *src_base = src;
3060   const unsigned char *src_end = coding->source + coding->src_bytes;
3061   int multibytep = coding->src_multibyte;
3062   int single_shifting = 0;
3063   int id;
3064   int c, c1;
3065   int consumed_chars = 0;
3066   int i;
3067   int rejected = 0;
3068   int found = 0;
3069   int composition_count = -1;
3070
3071   detect_info->checked |= CATEGORY_MASK_ISO;
3072
3073   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
3074     {
3075       struct coding_system *this = &(coding_categories[i]);
3076       Lisp_Object attrs, val;
3077
3078       if (this->id < 0)
3079         continue;
3080       attrs = CODING_ID_ATTRS (this->id);
3081       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
3082           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Viso_2022_charset_list))
3083         setup_iso_safe_charsets (attrs);
3084       val = CODING_ATTR_SAFE_CHARSETS (attrs);
3085       this->max_charset_id = SCHARS (val) - 1;
3086       this->safe_charsets = SDATA (val);
3087     }
3088
3089   /* A coding system of this category is always ASCII compatible.  */
3090   src += coding->head_ascii;
3091
3092   while (rejected != CATEGORY_MASK_ISO)
3093     {
3094       src_base = src;
3095       ONE_MORE_BYTE (c);
3096       switch (c)
3097         {
3098         case ISO_CODE_ESC:
3099           if (inhibit_iso_escape_detection)
3100             break;
3101           single_shifting = 0;
3102           ONE_MORE_BYTE (c);
3103           if (c >= '(' && c <= '/')
3104             {
3105               /* Designation sequence for a charset of dimension 1.  */
3106               ONE_MORE_BYTE (c1);
3107               if (c1 < ' ' || c1 >= 0x80
3108                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
3109                 /* Invalid designation sequence.  Just ignore.  */
3110                 break;
3111             }
3112           else if (c == '$')
3113             {
3114               /* Designation sequence for a charset of dimension 2.  */
3115               ONE_MORE_BYTE (c);
3116               if (c >= '@' && c <= 'B')
3117                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
3118                 id = iso_charset_table[1][0][c];
3119               else if (c >= '(' && c <= '/')
3120                 {
3121                   ONE_MORE_BYTE (c1);
3122                   if (c1 < ' ' || c1 >= 0x80
3123                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
3124                     /* Invalid designation sequence.  Just ignore.  */
3125                     break;
3126                 }
3127               else
3128                 /* Invalid designation sequence.  Just ignore it.  */
3129                 break;
3130             }
3131           else if (c == 'N' || c == 'O')
3132             {
3133               /* ESC <Fe> for SS2 or SS3.  */
3134               single_shifting = 1;
3135               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3136               break;
3137             }
3138           else if (c == '1')
3139             {
3140               /* End of composition.  */
3141               if (composition_count < 0
3142                   || composition_count > MAX_COMPOSITION_COMPONENTS)
3143                 /* Invalid */
3144                 break;
3145               composition_count = -1;
3146               found |= CATEGORY_MASK_ISO;
3147             }
3148           else if (c >= '0' && c <= '4')
3149             {
3150               /* ESC <Fp> for start/end composition.  */
3151               composition_count = 0;
3152               break;
3153             }
3154           else
3155             {
3156               /* Invalid escape sequence.  Just ignore it.  */
3157               break;
3158             }
3159
3160           /* We found a valid designation sequence for CHARSET.  */
3161           rejected |= CATEGORY_MASK_ISO_8BIT;
3162           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
3163                               id))
3164             found |= CATEGORY_MASK_ISO_7;
3165           else
3166             rejected |= CATEGORY_MASK_ISO_7;
3167           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
3168                               id))
3169             found |= CATEGORY_MASK_ISO_7_TIGHT;
3170           else
3171             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
3172           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
3173                               id))
3174             found |= CATEGORY_MASK_ISO_7_ELSE;
3175           else
3176             rejected |= CATEGORY_MASK_ISO_7_ELSE;
3177           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
3178                               id))
3179             found |= CATEGORY_MASK_ISO_8_ELSE;
3180           else
3181             rejected |= CATEGORY_MASK_ISO_8_ELSE;
3182           break;
3183
3184         case ISO_CODE_SO:
3185         case ISO_CODE_SI:
3186           /* Locking shift out/in.  */
3187           if (inhibit_iso_escape_detection)
3188             break;
3189           single_shifting = 0;
3190           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
3191           break;
3192
3193         case ISO_CODE_CSI:
3194           /* Control sequence introducer.  */
3195           single_shifting = 0;
3196           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3197           found |= CATEGORY_MASK_ISO_8_ELSE;
3198           goto check_extra_latin;
3199
3200         case ISO_CODE_SS2:
3201         case ISO_CODE_SS3:
3202           /* Single shift.   */
3203           if (inhibit_iso_escape_detection)
3204             break;
3205           single_shifting = 0;
3206           rejected |= CATEGORY_MASK_ISO_7BIT;
3207           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3208               & CODING_ISO_FLAG_SINGLE_SHIFT)
3209             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
3210           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
3211               & CODING_ISO_FLAG_SINGLE_SHIFT)
3212             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
3213           if (single_shifting)
3214             break;
3215           goto check_extra_latin;
3216
3217         default:
3218           if (c < 0)
3219             continue;
3220           if (c < 0x80)
3221             {
3222               if (composition_count >= 0)
3223                 composition_count++;
3224               single_shifting = 0;
3225               break;
3226             }
3227           if (c >= 0xA0)
3228             {
3229               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
3230               found |= CATEGORY_MASK_ISO_8_1;
3231               /* Check the length of succeeding codes of the range
3232                  0xA0..0FF.  If the byte length is even, we include
3233                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
3234                  only when we are not single shifting.  */
3235               if (! single_shifting
3236                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
3237                 {
3238                   int i = 1;
3239                   while (src < src_end)
3240                     {
3241                       ONE_MORE_BYTE (c);
3242                       if (c < 0xA0)
3243                         break;
3244                       i++;
3245                     }
3246
3247                   if (i & 1 && src < src_end)
3248                     {
3249                       rejected |= CATEGORY_MASK_ISO_8_2;
3250                       if (composition_count >= 0)
3251                         composition_count += i;
3252                     }
3253                   else
3254                     {
3255                       found |= CATEGORY_MASK_ISO_8_2;
3256                       if (composition_count >= 0)
3257                         composition_count += i / 2;
3258                     }
3259                 }
3260               break;
3261             }
3262         check_extra_latin:
3263           single_shifting = 0;
3264           if (! VECTORP (Vlatin_extra_code_table)
3265               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
3266             {
3267               rejected = CATEGORY_MASK_ISO;
3268               break;
3269             }
3270           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
3271               & CODING_ISO_FLAG_LATIN_EXTRA)
3272             found |= CATEGORY_MASK_ISO_8_1;
3273           else
3274             rejected |= CATEGORY_MASK_ISO_8_1;
3275           rejected |= CATEGORY_MASK_ISO_8_2;
3276         }
3277     }
3278   detect_info->rejected |= CATEGORY_MASK_ISO;
3279   return 0;
3280
3281  no_more_source:
3282   detect_info->rejected |= rejected;
3283   detect_info->found |= (found & ~rejected);
3284   return 1;
3285 }
3286
3287
/* Designate to graphic register REG of `coding' the charset that
   ISO_CHARSET_TABLE (DIM, CHARS_96, FINAL) yields for the final byte
   FINAL, recording it through CODING_ISO_DESIGNATION.

   If FINAL is outside '0'..127, the table has no charset for it, or
   the charset is not safe for `coding', the designation of REG is
   marked invalid (-2) and CHARS_96 is set to -1, which tells the
   caller that the escape sequence should be kept.  CHARS_96 is also
   set to -1 when ASCII is designated to a register whose previous
   designation was invalid.

   JISX0201-Roman is replaced by ASCII when CODING_ISO_FLAG_USE_ROMAN
   is set, and JISX0208.1978 by JISX0208 when
   CODING_ISO_FLAG_USE_OLDJIS is set.

   CHARS_96 must be an lvalue.  REG and FINAL may be evaluated more
   than once, so they must not have side effects.  The macro refers to
   the variable `coding' of the expansion site.  */
#define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
  do {                                                                  \
    int id, prev;                                                       \
                                                                        \
    /* FINAL is parenthesized so that an expression argument cannot    \
       regroup with the comparison operators.  */                       \
    if ((final) < '0' || (final) >= 128                                 \
        || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
        || !SAFE_CHARSET_P (coding, id))                                \
      {                                                                 \
        CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
        (chars_96) = -1;                                                \
        break;                                                          \
      }                                                                 \
    prev = CODING_ISO_DESIGNATION (coding, reg);                        \
    if (id == charset_jisx0201_roman)                                   \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
          id = charset_ascii;                                           \
      }                                                                 \
    else if (id == charset_jisx0208_1978)                               \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
          id = charset_jisx0208;                                        \
      }                                                                 \
    CODING_ISO_DESIGNATION (coding, reg) = id;                          \
    /* If there was an invalid designation to REG previously, and this  \
       designation is ASCII to REG, we should keep this designation     \
       sequence.  */                                                    \
    if (prev == -2 && id == charset_ascii)                              \
      (chars_96) = -1;                                                  \
  } while (0)
3320
3321
/* Handle these composition sequences (ALT: alternate char):
3323
3324    (1) relative composition: ESC 0 CHAR ... ESC 1
3325    (2) rulebase composition: ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
3326    (3) altchar composition:  ESC 3 ALT ... ALT ESC 0 CHAR ... ESC 1
3327    (4) alt&rule composition: ESC 4 ALT RULE ... ALT ESC 0 CHAR ... ESC 1
3328
3329    When the start sequence (ESC 0/2/3/4) is found, this annotation
3330    header is produced.
3331
3332         [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]
3333
3334    Then, upon reading CHAR or RULE (one or two bytes), these codes are
3335    produced until the end sequence (ESC 1) is found:
3336
3337    (1) CHAR ... CHAR
3338    (2) CHAR -2 DECODED-RULE CHAR -2 DECODED-RULE ... CHAR
3339    (3) ALT ... ALT -1 -1 CHAR ... CHAR
3340    (4) ALT -2 DECODED-RULE ALT -2 DECODED-RULE ... ALT -1 -1 CHAR ... CHAR
3341
   When the end sequence (ESC 1) is found, LENGTH and NCHARS in the
   annotation header are updated as below:
3344
3345    (1) LENGTH: unchanged,  NCHARS: number of CHARs
3346    (2) LENGTH: unchanged,  NCHARS: number of CHARs
3347    (3) LENGTH: += number of ALTs + 2,  NCHARS: number of CHARs
3348    (4) LENGTH: += number of ALTs * 3,  NCHARS: number of CHARs
3349
3350    If an error is found while composing, the annotation header is
3351    changed to:
3352
        [ ESC '0'/'2'/'3'/'4' -2 0 -1 ]
3354
3355    and the sequence [ -2 DECODED-RULE ] is changed to the original
3356    byte sequence as below:
3357         o the original byte sequence is B: [ B -1 ]
3358         o the original byte sequence is B1 B2: [ B1 B2 ]
3359    and the sequence [ -1 -1 ] is changed to the original byte
3360    sequence:
3361         [ ESC '0' ]
3362 */
3363
/* Decode the composition rule whose first byte is `c1' of the
   expansion site, fetching one more byte with ONE_MORE_BYTE when the
   rule is in the two-byte format.  Set RULE to the encoded
   composition rule and NBYTES to the number of bytes the rule
   occupied (1 or 2).  A negative RULE means the rule is invalid; when
   `c1' is below 32, NBYTES is left untouched.  RULE and NBYTES must
   be lvalues.  */

#define DECODE_COMPOSITION_RULE(rule, nbytes)                           \
  do {                                                                  \
    rule = c1 - 32;                                                     \
    if (rule >= 81)             /* new format (after ver.21) */         \
      {                                                                 \
        int second_byte;                                                \
                                                                        \
        ONE_MORE_BYTE (second_byte);                                    \
        rule = COMPOSITION_ENCODE_RULE (rule - 81, second_byte - 32);   \
        if (rule >= 0)                                                  \
          /* Offset it to tell it apart from the old format.  */        \
          rule += 0x100;                                                \
        nbytes = 2;                                                     \
      }                                                                 \
    else if (rule >= 0)         /* old format (before ver.21) */        \
      {                                                                 \
        int gref_code = (rule) / 9;                                     \
        int nref_code = (rule) % 9;                                     \
                                                                        \
        if (gref_code == 4)                                             \
          gref_code = 10;                                               \
        if (nref_code == 4)                                             \
          nref_code = 10;                                               \
        rule = COMPOSITION_ENCODE_RULE (gref_code, nref_code);          \
        nbytes = 1;                                                     \
      }                                                                 \
  } while (0)
3394
/* Turn the decoded composition rule RULE (a value as stored by
   DECODE_COMPOSITION_RULE) back into its source byte form and store
   it in charbuf[idx] and charbuf[idx + 1] of the expansion site:

     o RULE below 0x100 (old format): one byte followed by -1;
       `new_chars' is incremented by 1.
     o otherwise (new format): two bytes; `new_chars' is incremented
       by 2.

   RULE is evaluated exactly once, before anything is stored, so it
   may be an arbitrary expression, including charbuf[idx + 1] itself.
   The macro refers to `charbuf', `idx' and `new_chars' of the
   expansion site.  */
#define ENCODE_COMPOSITION_RULE(rule)                           \
  do {                                                          \
    int encoded_rule_ = (rule);                                 \
    int gref = (encoded_rule_ % 0x100) / 12;                    \
    int nref = (encoded_rule_ % 0x100) % 12;                    \
                                                                \
    if (encoded_rule_ < 0x100)  /* old format */                \
      {                                                         \
        if (gref == 10) gref = 4;                               \
        if (nref == 10) nref = 4;                               \
        charbuf[idx] = 32 + gref * 9 + nref;                    \
        charbuf[idx + 1] = -1;                                  \
        new_chars++;                                            \
      }                                                         \
    else                                /* new format */        \
      {                                                         \
        charbuf[idx] = 32 + 81 + gref;                          \
        charbuf[idx + 1] = 32 + nref;                           \
        new_chars += 2;                                         \
      }                                                         \
  } while (0)
3414
3415 /* Finish the current composition as invalid.  */
3416
3417 static int finish_composition P_ ((int *, struct composition_status *));
3418
3419 static int
3420 finish_composition (charbuf, cmp_status)
3421      int *charbuf;
3422      struct composition_status *cmp_status;
3423 {
3424   int idx = - cmp_status->length;
3425   int new_chars;
3426
3427   /* Recover the original ESC sequence */
3428   charbuf[idx++] = ISO_CODE_ESC;
3429   charbuf[idx++] = (cmp_status->method == COMPOSITION_RELATIVE ? '0'
3430                     : cmp_status->method == COMPOSITION_WITH_RULE ? '2'
3431                     : cmp_status->method == COMPOSITION_WITH_ALTCHARS ? '3'
3432                     /* cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS */
3433                     : '4');
3434   charbuf[idx++] = -2;
3435   charbuf[idx++] = 0;
3436   charbuf[idx++] = -1;
3437   new_chars = cmp_status->nchars;
3438   if (cmp_status->method >= COMPOSITION_WITH_RULE)
3439     for (; idx < 0; idx++)
3440       {
3441         int elt = charbuf[idx];
3442
3443         if (elt == -2)
3444           {
3445             ENCODE_COMPOSITION_RULE (charbuf[idx + 1]);
3446             idx++;
3447           }
3448         else if (elt == -1)
3449           {
3450             charbuf[idx++] = ISO_CODE_ESC;
3451             charbuf[idx] = '0';
3452             new_chars += 2;
3453           }
3454       }
3455   cmp_status->state = COMPOSING_NO;
3456   return new_chars;
3457 }
3458
/* If characters are under composition (cmp_status->state is not
   COMPOSING_NO), finish the composition with finish_composition and
   add its return value to `char_offset'.  Refers to `cmp_status',
   `charbuf' and `char_offset' of the expansion site.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    if (cmp_status->state == COMPOSING_NO)                      \
      break;                                                    \
    char_offset += finish_composition (charbuf, cmp_status);    \
  } while (0)
3465
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   C1 is the byte that followed ESC.

   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   When ESC 0 arrives while the alternate chars of an ESC 3 or ESC 4
   composition are being read, it only ends the alternate chars: the
   pair [ -1 -1 ] is stored and the state becomes COMPOSING_CHAR.

   Otherwise any composition in progress is finished as invalid and a
   new one is started.  The annotation header is produced by
   ADD_COMPOSITION_DATA (charbuf, 0, 0, METHOD); following the
   composition-sequence comment earlier in this file, that header is

   [ -LENGTH(==-5) CODING_ANNOTATE_COMPOSITION_MASK NCHARS(==0) 0 METHOD ]

   and cmp_status->length restarts from MAX_ANNOTATION_LENGTH.
*/

#define DECODE_COMPOSITION_START(c1)                                       \
  do {                                                                     \
    int ends_altchars_ = (c1 == '0');                                      \
                                                                           \
    if (ends_altchars_)                                                    \
      ends_altchars_                                                       \
        = ((cmp_status->state == COMPOSING_COMPONENT_CHAR                  \
            && cmp_status->method == COMPOSITION_WITH_ALTCHARS)            \
           || (cmp_status->state == COMPOSING_COMPONENT_RULE               \
               && cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS));  \
                                                                           \
    if (! ends_altchars_)                                                  \
      {                                                                    \
        MAYBE_FINISH_COMPOSITION ();                                       \
        if (c1 == '0')                                                     \
          cmp_status->method = COMPOSITION_RELATIVE;                       \
        else if (c1 == '2')                                                \
          cmp_status->method = COMPOSITION_WITH_RULE;                      \
        else if (c1 == '3')                                                \
          cmp_status->method = COMPOSITION_WITH_ALTCHARS;                  \
        else                                                               \
          cmp_status->method = COMPOSITION_WITH_RULE_ALTCHARS;             \
        if (c1 <= '2')                                                     \
          cmp_status->state = COMPOSING_CHAR;                              \
        else                                                               \
          cmp_status->state = COMPOSING_COMPONENT_CHAR;                    \
        ADD_COMPOSITION_DATA (charbuf, 0, 0, cmp_status->method);          \
        cmp_status->length = MAX_ANNOTATION_LENGTH;                        \
        cmp_status->ncomps = 0;                                            \
        cmp_status->nchars = 0;                                            \
        coding->annotated = 1;                                             \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        /* ESC 0 closing the alternate chars.  */                          \
        charbuf[0] = -1;                                                   \
        charbuf[1] = -1;                                                   \
        charbuf += 2;                                                      \
        cmp_status->state = COMPOSING_CHAR;                                \
        cmp_status->length += 2;                                           \
      }                                                                    \
  } while (0)
3506
3507
/* Handle composition end sequence ESC 1.

   The composition is invalid if no composed char has been stored, or
   if the state does not fit the method: for COMPOSITION_WITH_RULE the
   state must not be COMPOSING_CHAR, for every other method it must be
   COMPOSING_CHAR.  In that case the composition is finished as
   invalid and control jumps to `invalid_code'.

   Otherwise the annotation header, which starts cmp_status->length
   elements before `charbuf', is completed: its first element is
   decreased by (ncomps + 2) for COMPOSITION_WITH_ALTCHARS or by
   (ncomps * 3) for COMPOSITION_WITH_RULE_ALTCHARS, its third element
   receives nchars, `char_offset' advances by nchars, and the state
   returns to COMPOSING_NO.  */

#define DECODE_COMPOSITION_END()                                        \
  do {                                                                  \
    int *header_;                                                       \
                                                                        \
    if (cmp_status->nchars == 0                                         \
        || (cmp_status->method == COMPOSITION_WITH_RULE                 \
            ? cmp_status->state == COMPOSING_CHAR                       \
            : cmp_status->state != COMPOSING_CHAR))                     \
      {                                                                 \
        MAYBE_FINISH_COMPOSITION ();                                    \
        goto invalid_code;                                              \
      }                                                                 \
    header_ = charbuf - cmp_status->length;                             \
    if (cmp_status->method == COMPOSITION_WITH_ALTCHARS)                \
      header_[0] -= cmp_status->ncomps + 2;                             \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS)      \
      header_[0] -= cmp_status->ncomps * 3;                             \
    header_[2] = cmp_status->nchars;                                    \
    char_offset += cmp_status->nchars;                                  \
    cmp_status->state = COMPOSING_NO;                                   \
  } while (0)
3527
/* Append the pair [ -2 RULE ] to `charbuf' for the composition rule
   RULE, then update cmp_status: its length grows by 2 and its state
   is stepped back by one.  */

#define STORE_COMPOSITION_RULE(rule)    \
  do {                                  \
    charbuf[0] = -2;                    \
    charbuf[1] = rule;                  \
    charbuf += 2;                       \
    cmp_status->length += 2;            \
    cmp_status->state--;                \
  } while (0)
3537
/* Append C, a composed char or a component char, to `charbuf' and
   update cmp_status: the length grows by 1; nchars is counted when
   the state is COMPOSING_CHAR and ncomps otherwise; and the state is
   stepped forward by one when the method is COMPOSITION_WITH_RULE, or
   when it is COMPOSITION_WITH_RULE_ALTCHARS and the state is
   COMPOSING_COMPONENT_CHAR.  */

#define STORE_COMPOSITION_CHAR(c)                                       \
  do {                                                                  \
    *charbuf = (c);                                                     \
    charbuf++;                                                          \
    cmp_status->length += 1;                                            \
    if (cmp_status->state != COMPOSING_CHAR)                            \
      cmp_status->ncomps += 1;                                          \
    else                                                                \
      cmp_status->nchars += 1;                                          \
    if (cmp_status->method == COMPOSITION_WITH_RULE)                    \
      cmp_status->state++;                                              \
    else if (cmp_status->method == COMPOSITION_WITH_RULE_ALTCHARS       \
             && cmp_status->state == COMPOSING_COMPONENT_CHAR)          \
      cmp_status->state++;                                              \
  } while (0)
3554
3555
3556 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
3557
3558 static void
3559 decode_coding_iso_2022 (coding)
3560      struct coding_system *coding;
3561 {
3562   const unsigned char *src = coding->source + coding->consumed;
3563   const unsigned char *src_end = coding->source + coding->src_bytes;
3564   const unsigned char *src_base;
3565   int *charbuf = coding->charbuf + coding->charbuf_used;
3566   /* We may produce two annocations (charset and composition) in one
3567      loop and one more charset annocation at the end.  */
3568   int *charbuf_end
3569     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 3);
3570   int consumed_chars = 0, consumed_chars_base;
3571   int multibytep = coding->src_multibyte;
3572   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
3573   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3574   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3575   int charset_id_2, charset_id_3;
3576   struct charset *charset;
3577   int c;
3578   struct composition_status *cmp_status = CODING_ISO_CMP_STATUS (coding);
3579   Lisp_Object attrs, charset_list;
3580   int char_offset = coding->produced_char;
3581   int last_offset = char_offset;
3582   int last_id = charset_ascii;
3583   int eol_crlf =
3584     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
3585   int byte_after_cr = -1;
3586   int i;
3587
3588   CODING_GET_INFO (coding, attrs, charset_list);
3589   setup_iso_safe_charsets (attrs);
3590   /* Charset list may have been changed.  */
3591   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3592   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
3593
3594   if (cmp_status->state != COMPOSING_NO)
3595     {
3596       for (i = 0; i < cmp_status->length; i++)
3597         *charbuf++ = cmp_status->carryover[i];
3598       coding->annotated = 1;
3599     }
3600
3601   while (1)
3602     {
3603       int c1, c2, c3;
3604
3605       src_base = src;
3606       consumed_chars_base = consumed_chars;
3607
3608       if (charbuf >= charbuf_end)
3609         {
3610           if (byte_after_cr >= 0)
3611             src_base--;
3612           break;
3613         }
3614
3615       if (byte_after_cr >= 0)
3616         c1 = byte_after_cr, byte_after_cr = -1;
3617       else
3618         ONE_MORE_BYTE (c1);
3619       if (c1 < 0)
3620         goto invalid_code;
3621
3622       if (CODING_ISO_EXTSEGMENT_LEN (coding) > 0)
3623         {
3624           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3625           char_offset++;
3626           CODING_ISO_EXTSEGMENT_LEN (coding)--;
3627           continue;
3628         }
3629
3630       if (CODING_ISO_EMBEDDED_UTF_8 (coding))
3631         {
3632           if (c1 == ISO_CODE_ESC)
3633             {
3634               if (src + 1 >= src_end)
3635                 goto no_more_source;
3636               *charbuf++ = ISO_CODE_ESC;
3637               char_offset++;
3638               if (src[0] == '%' && src[1] == '@')
3639                 {
3640                   src += 2;
3641                   consumed_chars += 2;
3642                   char_offset += 2;
3643                   /* We are sure charbuf can contain two more chars. */
3644                   *charbuf++ = '%';
3645                   *charbuf++ = '@';
3646                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
3647                 }
3648             }
3649           else
3650             {
3651               *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3652               char_offset++;
3653             }
3654           continue;
3655         }
3656
3657       if ((cmp_status->state == COMPOSING_RULE
3658            || cmp_status->state == COMPOSING_COMPONENT_RULE)
3659           && c1 != ISO_CODE_ESC)
3660         {
3661           int rule, nbytes;
3662
3663           DECODE_COMPOSITION_RULE (rule, nbytes);
3664           if (rule < 0)
3665             goto invalid_code;
3666           STORE_COMPOSITION_RULE (rule);
3667           continue;
3668         }
3669
3670       /* We produce at most one character.  */
3671       switch (iso_code_class [c1])
3672         {
3673         case ISO_0x20_or_0x7F:
3674           if (charset_id_0 < 0
3675               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
3676             /* This is SPACE or DEL.  */
3677             charset = CHARSET_FROM_ID (charset_ascii);
3678           else
3679             charset = CHARSET_FROM_ID (charset_id_0);
3680           break;
3681
3682         case ISO_graphic_plane_0:
3683           if (charset_id_0 < 0)
3684             charset = CHARSET_FROM_ID (charset_ascii);
3685           else
3686             charset = CHARSET_FROM_ID (charset_id_0);
3687           break;
3688
3689         case ISO_0xA0_or_0xFF:
3690           if (charset_id_1 < 0
3691               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3692               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3693             goto invalid_code;
3694           /* This is a graphic character, we fall down ... */
3695
3696         case ISO_graphic_plane_1:
3697           if (charset_id_1 < 0)
3698             goto invalid_code;
3699           charset = CHARSET_FROM_ID (charset_id_1);
3700           break;
3701
3702         case ISO_control_0:
3703           if (eol_crlf && c1 == '\r')
3704             ONE_MORE_BYTE (byte_after_cr);
3705           MAYBE_FINISH_COMPOSITION ();
3706           charset = CHARSET_FROM_ID (charset_ascii);
3707           break;
3708
3709         case ISO_control_1:
3710           goto invalid_code;
3711
3712         case ISO_shift_out:
3713           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3714               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3715             goto invalid_code;
3716           CODING_ISO_INVOCATION (coding, 0) = 1;
3717           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3718           continue;
3719
3720         case ISO_shift_in:
3721           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3722             goto invalid_code;
3723           CODING_ISO_INVOCATION (coding, 0) = 0;
3724           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3725           continue;
3726
3727         case ISO_single_shift_2_7:
3728         case ISO_single_shift_2:
3729           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3730             goto invalid_code;
3731           /* SS2 is handled as an escape sequence of ESC 'N' */
3732           c1 = 'N';
3733           goto label_escape_sequence;
3734
3735         case ISO_single_shift_3:
3736           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3737             goto invalid_code;
3738           /* SS2 is handled as an escape sequence of ESC 'O' */
3739           c1 = 'O';
3740           goto label_escape_sequence;
3741
3742         case ISO_control_sequence_introducer:
3743           /* CSI is handled as an escape sequence of ESC '[' ...  */
3744           c1 = '[';
3745           goto label_escape_sequence;
3746
3747         case ISO_escape:
3748           ONE_MORE_BYTE (c1);
3749         label_escape_sequence:
3750           /* Escape sequences handled here are invocation,
3751              designation, direction specification, and character
3752              composition specification.  */
3753           switch (c1)
3754             {
3755             case '&':           /* revision of following character set */
3756               ONE_MORE_BYTE (c1);
3757               if (!(c1 >= '@' && c1 <= '~'))
3758                 goto invalid_code;
3759               ONE_MORE_BYTE (c1);
3760               if (c1 != ISO_CODE_ESC)
3761                 goto invalid_code;
3762               ONE_MORE_BYTE (c1);
3763               goto label_escape_sequence;
3764
3765             case '$':           /* designation of 2-byte character set */
3766               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3767                 goto invalid_code;
3768               {
3769                 int reg, chars96;
3770
3771                 ONE_MORE_BYTE (c1);
3772                 if (c1 >= '@' && c1 <= 'B')
3773                   {     /* designation of JISX0208.1978, GB2312.1980,
3774                            or JISX0208.1980 */
3775                     reg = 0, chars96 = 0;
3776                   }
3777                 else if (c1 >= 0x28 && c1 <= 0x2B)
3778                   { /* designation of DIMENSION2_CHARS94 character set */
3779                     reg = c1 - 0x28, chars96 = 0;
3780                     ONE_MORE_BYTE (c1);
3781                   }
3782                 else if (c1 >= 0x2C && c1 <= 0x2F)
3783                   { /* designation of DIMENSION2_CHARS96 character set */
3784                     reg = c1 - 0x2C, chars96 = 1;
3785                     ONE_MORE_BYTE (c1);
3786                   }
3787                 else
3788                   goto invalid_code;
3789                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3790                 /* We must update these variables now.  */
3791                 if (reg == 0)
3792                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3793                 else if (reg == 1)
3794                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3795                 if (chars96 < 0)
3796                   goto invalid_code;
3797               }
3798               continue;
3799
3800             case 'n':           /* invocation of locking-shift-2 */
3801               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3802                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3803                 goto invalid_code;
3804               CODING_ISO_INVOCATION (coding, 0) = 2;
3805               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3806               continue;
3807
3808             case 'o':           /* invocation of locking-shift-3 */
3809               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3810                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3811                 goto invalid_code;
3812               CODING_ISO_INVOCATION (coding, 0) = 3;
3813               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3814               continue;
3815
3816             case 'N':           /* invocation of single-shift-2 */
3817               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3818                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3819                 goto invalid_code;
3820               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3821               if (charset_id_2 < 0)
3822                 charset = CHARSET_FROM_ID (charset_ascii);
3823               else
3824                 charset = CHARSET_FROM_ID (charset_id_2);
3825               ONE_MORE_BYTE (c1);
3826               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3827                 goto invalid_code;
3828               break;
3829
3830             case 'O':           /* invocation of single-shift-3 */
3831               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3832                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3833                 goto invalid_code;
3834               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3835               if (charset_id_3 < 0)
3836                 charset = CHARSET_FROM_ID (charset_ascii);
3837               else
3838                 charset = CHARSET_FROM_ID (charset_id_3);
3839               ONE_MORE_BYTE (c1);
3840               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3841                 goto invalid_code;
3842               break;
3843
3844             case '0': case '2': case '3': case '4': /* start composition */
3845               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3846                 goto invalid_code;
3847               if (last_id != charset_ascii)
3848                 {
3849                   ADD_CHARSET_DATA (charbuf, char_offset- last_offset, last_id);
3850                   last_id = charset_ascii;
3851                   last_offset = char_offset;
3852                 }
3853               DECODE_COMPOSITION_START (c1);
3854               continue;
3855
3856             case '1':           /* end composition */
3857               if (cmp_status->state == COMPOSING_NO)
3858                 goto invalid_code;
3859               DECODE_COMPOSITION_END ();
3860               continue;
3861
3862             case '[':           /* specification of direction */
3863               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3864                 goto invalid_code;
3865               /* For the moment, nested direction is not supported.
3866                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3867                  left-to-right, and nozero means right-to-left.  */
3868               ONE_MORE_BYTE (c1);
3869               switch (c1)
3870                 {
3871                 case ']':       /* end of the current direction */
3872                   coding->mode &= ~CODING_MODE_DIRECTION;
3873
3874                 case '0':       /* end of the current direction */
3875                 case '1':       /* start of left-to-right direction */
3876                   ONE_MORE_BYTE (c1);
3877                   if (c1 == ']')
3878                     coding->mode &= ~CODING_MODE_DIRECTION;
3879                   else
3880                     goto invalid_code;
3881                   break;
3882
3883                 case '2':       /* start of right-to-left direction */
3884                   ONE_MORE_BYTE (c1);
3885                   if (c1 == ']')
3886                     coding->mode |= CODING_MODE_DIRECTION;
3887                   else
3888                     goto invalid_code;
3889                   break;
3890
3891                 default:
3892                   goto invalid_code;
3893                 }
3894               continue;
3895
3896             case '%':
3897               ONE_MORE_BYTE (c1);
3898               if (c1 == '/')
3899                 {
3900                   /* CTEXT extended segment:
3901                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3902                      We keep these bytes as is for the moment.
3903                      They may be decoded by post-read-conversion.  */
3904                   int dim, M, L;
3905                   int size;
3906
3907                   ONE_MORE_BYTE (dim);
3908                   if (dim < 0 || dim > 4)
3909                     goto invalid_code;
3910                   ONE_MORE_BYTE (M);
3911                   if (M < 128)
3912                     goto invalid_code;
3913                   ONE_MORE_BYTE (L);
3914                   if (L < 128)
3915                     goto invalid_code;
3916                   size = ((M - 128) * 128) + (L - 128);
3917                   if (charbuf + 6 > charbuf_end)
3918                     goto break_loop;
3919                   *charbuf++ = ISO_CODE_ESC;
3920                   *charbuf++ = '%';
3921                   *charbuf++ = '/';
3922                   *charbuf++ = dim;
3923                   *charbuf++ = BYTE8_TO_CHAR (M);
3924                   *charbuf++ = BYTE8_TO_CHAR (L);
3925                   CODING_ISO_EXTSEGMENT_LEN (coding) = size;
3926                 }
3927               else if (c1 == 'G')
3928                 {
3929                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3930                      ESC % G --UTF-8-BYTES-- ESC % @
3931                      We keep these bytes as is for the moment.
3932                      They may be decoded by post-read-conversion.  */
3933                   if (charbuf + 3 > charbuf_end)
3934                     goto break_loop;
3935                   *charbuf++ = ISO_CODE_ESC;
3936                   *charbuf++ = '%';
3937                   *charbuf++ = 'G';
3938                   CODING_ISO_EMBEDDED_UTF_8 (coding) = 1;
3939                 }
3940               else
3941                 goto invalid_code;
3942               continue;
3943               break;
3944
3945             default:
3946               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3947                 goto invalid_code;
3948               {
3949                 int reg, chars96;
3950
3951                 if (c1 >= 0x28 && c1 <= 0x2B)
3952                   { /* designation of DIMENSION1_CHARS94 character set */
3953                     reg = c1 - 0x28, chars96 = 0;
3954                     ONE_MORE_BYTE (c1);
3955                   }
3956                 else if (c1 >= 0x2C && c1 <= 0x2F)
3957                   { /* designation of DIMENSION1_CHARS96 character set */
3958                     reg = c1 - 0x2C, chars96 = 1;
3959                     ONE_MORE_BYTE (c1);
3960                   }
3961                 else
3962                   goto invalid_code;
3963                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3964                 /* We must update these variables now.  */
3965                 if (reg == 0)
3966                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3967                 else if (reg == 1)
3968                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3969                 if (chars96 < 0)
3970                   goto invalid_code;
3971               }
3972               continue;
3973             }
3974         }
3975
3976       if (cmp_status->state == COMPOSING_NO
3977           && charset->id != charset_ascii
3978           && last_id != charset->id)
3979         {
3980           if (last_id != charset_ascii)
3981             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3982           last_id = charset->id;
3983           last_offset = char_offset;
3984         }
3985
3986       /* Now we know CHARSET and 1st position code C1 of a character.
3987          Produce a decoded character while getting 2nd and 3rd
3988          position codes C2, C3 if necessary.  */
3989       if (CHARSET_DIMENSION (charset) > 1)
3990         {
3991           ONE_MORE_BYTE (c2);
3992           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0)
3993               || ((c1 & 0x80) != (c2 & 0x80)))
3994             /* C2 is not in a valid range.  */
3995             goto invalid_code;
3996           if (CHARSET_DIMENSION (charset) == 2)
3997             c1 = (c1 << 8) | c2;
3998           else
3999             {
4000               ONE_MORE_BYTE (c3);
4001               if (c3 < 0x20 || (c3 >= 0x80 && c3 < 0xA0)
4002                   || ((c1 & 0x80) != (c3 & 0x80)))
4003                 /* C3 is not in a valid range.  */
4004                 goto invalid_code;
4005               c1 = (c1 << 16) | (c2 << 8) | c2;
4006             }
4007         }
4008       c1 &= 0x7F7F7F;
4009       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
4010       if (c < 0)
4011         {
4012           MAYBE_FINISH_COMPOSITION ();
4013           for (; src_base < src; src_base++, char_offset++)
4014             {
4015               if (ASCII_BYTE_P (*src_base))
4016                 *charbuf++ = *src_base;
4017               else
4018                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
4019             }
4020         }
4021       else if (cmp_status->state == COMPOSING_NO)
4022         {
4023           *charbuf++ = c;
4024           char_offset++;
4025         }
4026       else if ((cmp_status->state == COMPOSING_CHAR
4027                 ? cmp_status->nchars
4028                 : cmp_status->ncomps)
4029                >= MAX_COMPOSITION_COMPONENTS)
4030         {
4031           /* Too long composition.  */
4032           MAYBE_FINISH_COMPOSITION ();
4033           *charbuf++ = c;
4034           char_offset++;
4035         }
4036       else
4037         STORE_COMPOSITION_CHAR (c);
4038       continue;
4039
4040     invalid_code:
4041       MAYBE_FINISH_COMPOSITION ();
4042       src = src_base;
4043       consumed_chars = consumed_chars_base;
4044       ONE_MORE_BYTE (c);
4045       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4046       char_offset++;
4047       coding->errors++;
4048       continue;
4049
4050     break_loop:
4051       break;
4052     }
4053
4054  no_more_source:
4055   if (cmp_status->state != COMPOSING_NO)
4056     {
4057       if (coding->mode & CODING_MODE_LAST_BLOCK)
4058         MAYBE_FINISH_COMPOSITION ();
4059       else
4060         {
4061           charbuf -= cmp_status->length;
4062           for (i = 0; i < cmp_status->length; i++)
4063             cmp_status->carryover[i] = charbuf[i];
4064         }
4065     }
4066   else if (last_id != charset_ascii)
4067     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4068   coding->consumed_char += consumed_chars_base;
4069   coding->consumed = src_base - coding->source;
4070   coding->charbuf_used = charbuf - coding->charbuf;
4071 }
4072
4073
4074 /* ISO2022 encoding stuff.  */
4075
/*
   Saying just "ISO2022" is not enough to specify an encoding; more
   details have to be given.  In Emacs, each coding system of ISO2022
   variant has the following specifications:
        1. Initial designation to G0 through G3.
        2. Allows short-form designation?
        3. ASCII should be designated to G0 before control characters?
        4. ASCII should be designated to G0 at end of line?
        5. 7-bit environment or 8-bit environment?
        6. Use locking-shift?
        7. Use single-shift?
   And the following two are only for Japanese:
        8. Use JISX0201-Roman in place of ASCII?
        9. Use JISX0208-1978 in place of JISX0208-1983?
   These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
   defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
   details.
*/
4094
/* Emit, at DST, the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the new designation in
   CODING.

   The sequence is: an optional revision prefix (ESC '&' '@'+REVISION,
   emitted only when CODING has CODING_ISO_FLAG_REVISION and the
   revision of CHARSET is non-negative), then ESC, then '$' for a
   charset whose dimension is not 1, then the intermediate character
   selecting REG (taken from "()*+" for a 94-charset or ",-./" for a
   96-charset), then the final character of CHARSET.  For a
   multi-dimension 94-charset the intermediate character is omitted
   (short form) when REG is 0, the final character is '@', 'A', or
   'B', and CODING does not have CODING_ISO_FLAG_LONG_FORM.

   The arguments are expanded more than once, so they must be free of
   side effects.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *desig_inter                                             \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    int desig_rev = -1;                                                 \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      desig_rev = CHARSET_ISO_REVISION (charset);                       \
    if (desig_rev >= 0)                                                 \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_rev);                                \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        /* Only a 94-charset in G0 with final '@'..'B' may use the      \
           short form, and only if the long form is not forced.  */     \
        if (CHARSET_ISO_CHARS_96 (charset)                              \
            || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)  \
            || reg != 0                                                 \
            || desig_final < '@' || desig_final > 'B')                  \
          EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                       \
      }                                                                 \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (desig_inter[reg]);                           \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
4142
4143
/* The following two macros produce codes (control character or escape
   sequence) for the ISO2022 single-shift functions (single-shift-2
   and single-shift-3).  Each one also sets
   CODING_ISO_SINGLE_SHIFTING (coding), which the
   ENCODE_ISO_CHARACTER_DIMENSION* macros test and clear when they
   emit the next character.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do                                                                    \
    {                                                                   \
      if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))   \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      else                                                              \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      CODING_ISO_SINGLE_SHIFTING (coding) = 1;                          \
    }                                                                   \
  while (0)
4156
4157
/* Emit the single-shift-3 function: ESC 'O' when CODING is flagged
   CODING_ISO_FLAG_SEVEN_BITS, otherwise the single byte ISO_CODE_SS3.
   Then mark CODING as being in the single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do                                                                    \
    {                                                                   \
      if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))   \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      else                                                              \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      CODING_ISO_SINGLE_SHIFTING (coding) = 1;                          \
    }                                                                   \
  while (0)
4166
4167
/* The following four macros produce codes (control character or
   escape sequence) for the ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one also
   records in CODING_ISO_INVOCATION (coding, 0) the graphic register
   that is now invoked to graphic plane 0.  */

/* Shift-in: emit ISO_CODE_SI and record register 0 as invoked.  */
#define ENCODE_SHIFT_IN                                                 \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                                \
      CODING_ISO_INVOCATION (coding, 0) = 0;                            \
    }                                                                   \
  while (0)
4177
4178
/* Shift-out: emit ISO_CODE_SO and record register 1 as invoked to
   graphic plane 0.  */
#define ENCODE_SHIFT_OUT                                                \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                                \
      CODING_ISO_INVOCATION (coding, 0) = 1;                            \
    }                                                                   \
  while (0)
4184
4185
/* Locking-shift-2: emit ESC 'n' and record register 2 as invoked to
   graphic plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                                          \
  do                                                                    \
    {                                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');                         \
      CODING_ISO_INVOCATION (coding, 0) = 2;                            \
    }                                                                   \
  while (0)
4191
4192
/* Locking-shift-3: emit ESC 'o' and record register 3 as invoked to
   graphic plane 0.  The final byte must be 'o'; ESC 'n' is
   locking-shift-2, and the ISO2022 decoder in this file treats the
   two accordingly.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
4198
4199
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN and CHARSET is ASCII, it is replaced by
   the jisx0201-roman charset.  The macro refers to `coding', `dst'
   and `produced_chars' of the enclosing function by name.

   If a single-shift is pending, C1 is emitted for it (7-bit or 8-bit
   form according to CODING_ISO_FLAG_SEVEN_BITS) and the pending state
   is cleared.  Otherwise C1 is emitted with the high bit clear if
   CHARSET is invoked to graphic plane 0, or with the high bit set if
   it is invoked to graphic plane 1.  C1 is parenthesized at every use
   so that an expression argument is masked as a whole.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
4242
4243
/* Emit a DIMENSION2 character of CHARSET whose two position-codes are
   C1 and C2, first producing whatever designation and invocation
   codes are needed to make CHARSET usable.

   CHARSET must be a modifiable lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS and CHARSET is jisx0208, it is replaced
   by jisx0208_1978.  The macro refers to `coding', `dst' and
   `produced_chars' of the enclosing function by name.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    for (;;)                                                            \
      {                                                                 \
        int id = CHARSET_ID (charset);                                  \
                                                                        \
        if (id == charset_jisx0208                                      \
            && (CODING_ISO_FLAGS (coding)                               \
                & CODING_ISO_FLAG_USE_OLDJIS))                          \
          {                                                             \
            id = charset_jisx0208_1978;                                 \
            charset = CHARSET_FROM_ID (id);                             \
          }                                                             \
                                                                        \
        if (CODING_ISO_SINGLE_SHIFTING (coding))                        \
          {                                                             \
            /* A single-shift is pending; this character uses it.  */   \
            if (! (CODING_ISO_FLAGS (coding)                            \
                   & CODING_ISO_FLAG_SEVEN_BITS))                       \
              EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                \
            else                                                        \
              EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);          \
            CODING_ISO_SINGLE_SHIFTING (coding) = 0;                    \
            break;                                                      \
          }                                                             \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))               \
          {                                                             \
            EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);            \
            break;                                                      \
          }                                                             \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))               \
          {                                                             \
            EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                  \
            break;                                                      \
          }                                                             \
        /* CHARSET is not invoked to either graphic plane yet.          \
           Designate and/or invoke it, then go around again to          \
           actually produce the character.  */                          \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
      }                                                                 \
  } while (0)
4286
4287
/* Encode character C of CHARSET.  The code point obtained from
   ENCODE_CHAR is emitted as a single position-code when the dimension
   of CHARSET is 1; otherwise it is split into its second-lowest and
   lowest bytes and emitted as a two-byte character.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset),(c));                                \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset),                        \
                                         code >> 8, code & 0xFF);          \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                 \
      }                                                                    \
  } while (0)
4297
4298
4299 /* Produce designation and invocation codes at a place pointed by DST
4300    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
4301    Return new DST.  */
4302
4303 unsigned char *
4304 encode_invocation_designation (charset, coding, dst, p_nchars)
4305      struct charset *charset;
4306      struct coding_system *coding;
4307      unsigned char *dst;
4308      int *p_nchars;
4309 {
4310   int multibytep = coding->dst_multibyte;
4311   int produced_chars = *p_nchars;
4312   int reg;                      /* graphic register number */
4313   int id = CHARSET_ID (charset);
4314
4315   /* At first, check designations.  */
4316   for (reg = 0; reg < 4; reg++)
4317     if (id == CODING_ISO_DESIGNATION (coding, reg))
4318       break;
4319
4320   if (reg >= 4)
4321     {
4322       /* CHARSET is not yet designated to any graphic registers.  */
4323       /* At first check the requested designation.  */
4324       reg = CODING_ISO_REQUEST (coding, id);
4325       if (reg < 0)
4326         /* Since CHARSET requests no special designation, designate it
4327            to graphic register 0.  */
4328         reg = 0;
4329
4330       ENCODE_DESIGNATION (charset, reg, coding);
4331     }
4332
4333   if (CODING_ISO_INVOCATION (coding, 0) != reg
4334       && CODING_ISO_INVOCATION (coding, 1) != reg)
4335     {
4336       /* Since the graphic register REG is not invoked to any graphic
4337          planes, invoke it to graphic plane 0.  */
4338       switch (reg)
4339         {
4340         case 0:                 /* graphic register 0 */
4341           ENCODE_SHIFT_IN;
4342           break;
4343
4344         case 1:                 /* graphic register 1 */
4345           ENCODE_SHIFT_OUT;
4346           break;
4347
4348         case 2:                 /* graphic register 2 */
4349           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4350             ENCODE_SINGLE_SHIFT_2;
4351           else
4352             ENCODE_LOCKING_SHIFT_2;
4353           break;
4354
4355         case 3:                 /* graphic register 3 */
4356           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
4357             ENCODE_SINGLE_SHIFT_3;
4358           else
4359             ENCODE_LOCKING_SHIFT_3;
4360           break;
4361         }
4362     }
4363
4364   *p_nchars = produced_chars;
4365   return dst;
4366 }
4367
/* The following three macros produce codes for indicating direction
   of text.  */

/* Emit a control sequence introducer: ESC '[' when CODING has
   CODING_ISO_FLAG_SEVEN_BITS set, otherwise the single byte
   ISO_CODE_CSI.  The flag is tested with `&', as everywhere else in
   this file, because CODING_ISO_FLAGS holds several flag bits at
   once.  This is an object-like macro; use it without arguments.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
4377
4378
/* Emit the sequence that starts right-to-left text: a control
   sequence introducer followed by '2' ']'.  The ISO2022 decoder in
   this file reads ESC '[' '2' ']' as the start of right-to-left
   direction.  ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like
   macro, so it is used here without an argument list.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
4384
4385
/* Emit the sequence that ends the current direction (back to
   left-to-right): a control sequence introducer followed by '0' ']'.
   The ISO2022 decoder in this file reads ESC '[' '0' ']' as clearing
   the right-to-left mode.  ENCODE_CONTROL_SEQUENCE_INTRODUCER is an
   object-like macro, so it is used here without an argument list.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
4391
4392
/* Bring the graphic planes and registers back to their initial
   state: emit shift-in if graphic plane 0 does not currently invoke
   register 0, then re-designate every register whose current
   designation differs from its initial one.  A register whose
   initial designation is negative is left alone.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        struct charset *charset;                                        \
                                                                        \
        if (CODING_ISO_INITIAL (coding, reg) < 0)                       \
          continue;                                                     \
        if (CODING_ISO_DESIGNATION (coding, reg)                        \
            == CODING_ISO_INITIAL (coding, reg))                        \
          continue;                                                     \
        charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg));   \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
4411
4412
4413 /* Produce designation sequences of charsets in the line started from
4414    SRC to a place pointed by DST, and return updated DST.
4415
4416    If the current block ends before any end-of-line, we may fail to
4417    find all the necessary designations.  */
4418
4419 static unsigned char *
4420 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
4421      struct coding_system *coding;
4422      int *charbuf, *charbuf_end;
4423      unsigned char *dst;
4424 {
4425   struct charset *charset;
4426   /* Table of charsets to be designated to each graphic register.  */
4427   int r[4];
4428   int c, found = 0, reg;
4429   int produced_chars = 0;
4430   int multibytep = coding->dst_multibyte;
4431   Lisp_Object attrs;
4432   Lisp_Object charset_list;
4433
4434   attrs = CODING_ID_ATTRS (coding->id);
4435   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4436   if (EQ (charset_list, Qiso_2022))
4437     charset_list = Viso_2022_charset_list;
4438
4439   for (reg = 0; reg < 4; reg++)
4440     r[reg] = -1;
4441
4442   while (found < 4)
4443     {
4444       int id;
4445
4446       c = *charbuf++;
4447       if (c == '\n')
4448         break;
4449       charset = char_charset (c, charset_list, NULL);
4450       id = CHARSET_ID (charset);
4451       reg = CODING_ISO_REQUEST (coding, id);
4452       if (reg >= 0 && r[reg] < 0)
4453         {
4454           found++;
4455           r[reg] = id;
4456         }
4457     }
4458
4459   if (found)
4460     {
4461       for (reg = 0; reg < 4; reg++)
4462         if (r[reg] >= 0
4463             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
4464           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
4465     }
4466
4467   return dst;
4468 }
4469
4470 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".

        Encode the CODING->charbuf_used elements of CODING->charbuf as
        ISO-2022 and store the result from CODING->destination +
        CODING->produced onward.  Negative elements of the buffer start
        annotations rather than characters.  At the end the function
        records CODING_RESULT_SUCCESS, saves the pending
        beginning-of-line designation flag in CODING_ISO_BOL, updates
        CODING->produced_char and CODING->produced, and returns 0.

        NOTE(review): some locals (e.g. multibytep, dst_end) are never
        named explicitly below; presumably the EMIT_xxx, ENCODE_xxx and
        ASSURE_DESTINATION macros use them -- check the macro
        definitions before renaming anything here.  */
4471
4472 static int
4473 encode_coding_iso_2022 (coding)
4474      struct coding_system *coding;
4475 {
4476   int multibytep = coding->dst_multibyte;
4477   int *charbuf = coding->charbuf;
4478   int *charbuf_end = charbuf + coding->charbuf_used;
4479   unsigned char *dst = coding->destination + coding->produced;
4480   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Argument given to ASSURE_DESTINATION before each element.  */
4481   int safe_room = 16;
       /* Nonzero when designation sequences have to be produced before
          the next element: the coding system designates at beginning
          of line and the saved state says we are at one.  */
4482   int bol_designation
4483     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
4484        && CODING_ISO_BOL (coding));
4485   int produced_chars = 0;
4486   Lisp_Object attrs, eol_type, charset_list;
4487   int ascii_compatible;
4488   int c;
       /* Charset requested by the latest charset annotation, or -1.  */
4489   int preferred_charset_id = -1;
4490
4491   CODING_GET_INFO (coding, attrs, charset_list);
       /* An EOL type that is still a vector is treated as Qunix, as is
          any EOL type when EOL conversion is inhibited.  */
4492   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
4493   if (VECTORP (eol_type))
4494     eol_type = Qunix;
4495
4496   setup_iso_safe_charsets (attrs);
4497   /* Charset list may have been changed.  */
4498   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
4499   coding->safe_charsets = SDATA (CODING_ATTR_SAFE_CHARSETS (attrs));
4500
4501   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4502
4503   while (charbuf < charbuf_end)
4504     {
4505       ASSURE_DESTINATION (safe_room);
4506
4507       if (bol_designation)
4508         {
4509           unsigned char *dst_prev = dst;
4510
4511           /* We have to produce designation sequences if any now.  */
4512           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
4513           bol_designation = 0;
4514           /* We are sure that designation sequences are all ASCII bytes.  */
4515           produced_chars += dst - dst_prev;
4516         }
4517
4518       c = *charbuf++;
4519
4520       if (c < 0)
4521         {
4522           /* Handle an annotation.  C is the negated length of the
                  annotation and CHARBUF now points at its second
                  element, which selects the kind of annotation.  */
4523           switch (*charbuf)
4524             {
4525             case CODING_ANNOTATE_COMPOSITION_MASK:
4526               /* Not yet implemented.  */
4527               break;
4528             case CODING_ANNOTATE_CHARSET_MASK:
                   /* The charset ID sits two elements after the kind.
                      It is honoured only if it is a member of
                      CHARSET_LIST; otherwise it is dropped.  */
4529               preferred_charset_id = charbuf[2];
4530               if (preferred_charset_id >= 0
4531                   && NILP (Fmemq (make_number (preferred_charset_id),
4532                                   charset_list)))
4533                 preferred_charset_id = -1;
4534               break;
4535             default:
4536               abort ();
4537             }
               /* Skip the remaining -C - 1 elements of the annotation.  */
4538           charbuf += -c - 1;
4539           continue;
4540         }
4541
4542       /* Now encode the character C.  */
4543       if (c < 0x20 || c == 0x7F)
4544         {
               /* End of line: '\n', or '\r' when the EOL type is Qmac.  */
4545           if (c == '\n'
4546               || (c == '\r' && EQ (eol_type, Qmac)))
4547             {
4548               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4549                 ENCODE_RESET_PLANE_AND_REGISTER ();
4550               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
4551                 {
4552                   int i;
4553
                       /* Restore the initial designation of all four
                          registers.  */
4554                   for (i = 0; i < 4; i++)
4555                     CODING_ISO_DESIGNATION (coding, i)
4556                       = CODING_ISO_INITIAL (coding, i);
4557                 }
                   /* The next element starts a line, so designations
                      may be needed again.  */
4558               bol_designation
4559                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
4560             }
4561           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
4562             ENCODE_RESET_PLANE_AND_REGISTER ();
4563           EMIT_ONE_ASCII_BYTE (c);
4564         }
4565       else if (ASCII_CHAR_P (c))
4566         {
4567           if (ascii_compatible)
4568             EMIT_ONE_ASCII_BYTE (c);
4569           else
4570             {
4571               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
4572               ENCODE_ISO_CHARACTER (charset, c);
4573             }
4574         }
4575       else if (CHAR_BYTE8_P (c))
4576         {
               /* Convert the byte8 character back to its byte value and
                  hand that to EMIT_ONE_BYTE.  */
4577           c = CHAR_TO_BYTE8 (c);
4578           EMIT_ONE_BYTE (c);
4579         }
4580       else
4581         {
4582           struct charset *charset;
4583
               /* Try the annotated charset first; fall back to a search
                  of CHARSET_LIST when it does not contain C.  */
4584           if (preferred_charset_id >= 0)
4585             {
4586               charset = CHARSET_FROM_ID (preferred_charset_id);
4587               if (! CHAR_CHARSET_P (c, charset))
4588                 charset = char_charset (c, charset_list, NULL);
4589             }
4590           else
4591             charset = char_charset (c, charset_list, NULL);
4592           if (!charset)
4593             {
                   /* No charset was found for C.  In safe-encoding mode
                      encode CODING_INHIBIT_CHARACTER_SUBSTITUTION with
                      the ASCII charset, otherwise encode the coding
                      system's default character.  */
4594               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4595                 {
4596                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4597                   charset = CHARSET_FROM_ID (charset_ascii);
4598                 }
4599               else
4600                 {
4601                   c = coding->default_char;
4602                   charset = char_charset (c, charset_list, NULL);
4603                 }
4604             }
4605           ENCODE_ISO_CHARACTER (charset, c);
4606         }
4607     }
4608
       /* On the last block, reset the plane and register state if the
          coding system asks for a reset at end of line.  */
4609   if (coding->mode & CODING_MODE_LAST_BLOCK
4610       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
4611     {
4612       ASSURE_DESTINATION (safe_room);
4613       ENCODE_RESET_PLANE_AND_REGISTER ();
4614     }
4615   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4616   CODING_ISO_BOL (coding) = bol_designation;
4617   coding->produced_char += produced_chars;
4618   coding->produced = dst - coding->destination;
4619   return 0;
4620 }
4621
4622 \f
4623 /*** 8. Shift-JIS and BIG5 handlers ***/
4624
4625 /* Although SJIS and BIG5 are not ISO's coding system, they are used
4626    quite widely.  So, for the moment, Emacs supports them in the bare
4627    C code.  But, in the future, they may be supported only by CCL.  */
4628
4629 /* SJIS is a coding system encoding three character sets: ASCII, right
4630    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
4631    as is.  A character of charset katakana-jisx0201 is encoded by
4632    "position-code + 0x80".  A character of charset japanese-jisx0208
4633    is encoded in two bytes, with its two position-codes divided and
4634    shifted so that they fit in the ranges below.
4635
4636    --- CODE RANGE of SJIS ---
4637    (character set)      (range)
4638    ASCII                0x00 .. 0x7F
4639    KATAKANA-JISX0201    0xA0 .. 0xDF
4640    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
4641             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
4642    -------------------------------
4643
4644 */
4645
4646 /* BIG5 is a coding system encoding two character sets: ASCII and
4647    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
4648    character set and is encoded in two bytes.
4649
4650    --- CODE RANGE of BIG5 ---
4651    (character set)      (range)
4652    ASCII                0x00 .. 0x7F
4653    Big5 (1st byte)      0xA1 .. 0xFE
4654         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
4655    --------------------------
4656
4657   */
4658
4659 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4660    Check if a text is encoded in SJIS.  When a byte that cannot be
4661    part of SJIS text is seen, set CATEGORY_MASK_SJIS in
        DETECT_INFO->rejected and return 0.  The same happens when the
        source ends in the middle of a sequence and CODING is in its
        last block.  Otherwise, when the source runs out, add FOUND
        (CATEGORY_MASK_SJIS if at least one kana byte or two-byte code
        was accepted, else 0) to DETECT_INFO->found and return 1.  */
4662
4663 static int
4664 detect_coding_sjis (coding, detect_info)
4665      struct coding_system *coding;
4666      struct coding_detection_info *detect_info;
4667 {
4668   const unsigned char *src = coding->source, *src_base;
4669   const unsigned char *src_end = coding->source + coding->src_bytes;
4670   int multibytep = coding->src_multibyte;
4671   int consumed_chars = 0;
4672   int found = 0;
4673   int c;
4674   Lisp_Object attrs, charset_list;
4675   int max_first_byte_of_2_byte_code;
4676
4677   CODING_GET_INFO (coding, attrs, charset_list);
       /* With more than three charsets in the list, first bytes up to
          0xFC are accepted; otherwise only up to 0xEF.  */
4678   max_first_byte_of_2_byte_code
4679     = (XINT (Flength (charset_list)) > 3 ? 0xFC : 0xEF);
4680
4681   detect_info->checked |= CATEGORY_MASK_SJIS;
4682   /* A coding system of this category is always ASCII compatible.  */
4683   src += coding->head_ascii;
4684
4685   while (1)
4686     {
           /* Remember where this sequence starts; the code at
              no_more_source compares SRC against it.  */
4687       src_base = src;
4688       ONE_MORE_BYTE (c);
4689       if (c < 0x80)
4690         continue;
4691       if ((c >= 0x81 && c <= 0x9F)
4692           || (c >= 0xE0 && c <= max_first_byte_of_2_byte_code))
4693         {
               /* First byte of a two-byte code; validate the second.  */
4694           ONE_MORE_BYTE (c);
4695           if (c < 0x40 || c == 0x7F || c > 0xFC)
4696             break;
4697           found = CATEGORY_MASK_SJIS;
4698         }
4699       else if (c >= 0xA0 && c < 0xE0)
4700         found = CATEGORY_MASK_SJIS;
4701       else
4702         break;
4703     }
       /* The loop was left by `break': the text is not SJIS.  */
4704   detect_info->rejected |= CATEGORY_MASK_SJIS;
4705   return 0;
4706
       /* Presumably the label ONE_MORE_BYTE jumps to when the source is
          exhausted (the macro is defined elsewhere).  */
4707  no_more_source:
       /* SRC is past SRC_BASE only if part of a sequence was consumed;
          on the last block that is treated as a rejection.  */
4708   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4709     {
4710       detect_info->rejected |= CATEGORY_MASK_SJIS;
4711       return 0;
4712     }
4713   detect_info->found |= found;
4714   return 1;
4715 }
4716
4717 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4718    Check if a text is encoded in BIG5.  Return 0 when the text is
4719    judged not to be BIG5.  Otherwise, when the source runs out, add
        FOUND (CATEGORY_MASK_BIG5 if at least one two-byte code was
        accepted, else 0) to DETECT_INFO->found and return 1.  */
4720
4721 static int
4722 detect_coding_big5 (coding, detect_info)
4723      struct coding_system *coding;
4724      struct coding_detection_info *detect_info;
4725 {
4726   const unsigned char *src = coding->source, *src_base;
4727   const unsigned char *src_end = coding->source + coding->src_bytes;
4728   int multibytep = coding->src_multibyte;
4729   int consumed_chars = 0;
4730   int found = 0;
4731   int c;
4732
4733   detect_info->checked |= CATEGORY_MASK_BIG5;
4734   /* A coding system of this category is always ASCII compatible.  */
4735   src += coding->head_ascii;
4736
4737   while (1)
4738     {
           /* Remember where this sequence starts; the code at
              no_more_source compares SRC against it.  */
4739       src_base = src;
4740       ONE_MORE_BYTE (c);
4741       if (c < 0x80)
4742         continue;
4743       if (c >= 0xA1)
4744         {
               /* First byte of a two-byte code; validate the second.  */
4745           ONE_MORE_BYTE (c);
               /* NOTE(review): this path returns 0 without setting
                  CATEGORY_MASK_BIG5 in DETECT_INFO->rejected, unlike
                  the `break' below and unlike the corresponding check
                  in detect_coding_sjis.  Confirm whether leaving the
                  category unrejected here is intentional.  */
4746           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4747             return 0;
4748           found = CATEGORY_MASK_BIG5;
4749         }
4750       else
4751         break;
4752     }
       /* A byte in 0x80..0xA0 was seen as a first byte: not BIG5.  */
4753   detect_info->rejected |= CATEGORY_MASK_BIG5;
4754   return 0;
4755
       /* Presumably the label ONE_MORE_BYTE jumps to when the source is
          exhausted (the macro is defined elsewhere).  */
4756  no_more_source:
       /* SRC is past SRC_BASE only if part of a sequence was consumed;
          on the last block that is treated as a rejection.  */
4757   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4758     {
4759       detect_info->rejected |= CATEGORY_MASK_BIG5;
4760       return 0;
4761     }
4762   detect_info->found |= found;
4763   return 1;
4764 }
4765
4766 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4767    Decode SJIS text.  Bytes are read from CODING->source +
        CODING->consumed and decoded characters are appended to
        CODING->charbuf.  The first three elements of the charset list
        are used as the roman, kana and kanji charsets; an optional
        fourth one (charset_kanji2) enables first bytes 0xF0..0xFC.
        Bytes that cannot be decoded are stored one by one and counted
        in CODING->errors.  */
4768
4769 static void
4770 decode_coding_sjis (coding)
4771      struct coding_system *coding;
4772 {
4773   const unsigned char *src = coding->source + coding->consumed;
4774   const unsigned char *src_end = coding->source + coding->src_bytes;
4775   const unsigned char *src_base;
4776   int *charbuf = coding->charbuf + coding->charbuf_used;
4777   /* We may produce one charset annotation in one loop and one more at
4778      the end.  */
4779   int *charbuf_end
4780     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4781   int consumed_chars = 0, consumed_chars_base;
4782   int multibytep = coding->src_multibyte;
4783   struct charset *charset_roman, *charset_kanji, *charset_kana;
4784   struct charset *charset_kanji2;
4785   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts decoded characters; LAST_OFFSET and LAST_ID
          describe the current run of characters of one non-ASCII
          charset, for which a charset annotation is added when the run
          ends.  */
4786   int char_offset = coding->produced_char;
4787   int last_offset = char_offset;
4788   int last_id = charset_ascii;
4789   int eol_crlf =
4790     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte already read after a CR (see below), or -1 if none.  */
4791   int byte_after_cr = -1;
4792
4793   CODING_GET_INFO (coding, attrs, charset_list);
4794
4795   val = charset_list;
4796   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4797   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4798   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4799   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4800
4801   while (1)
4802     {
4803       int c, c1;
4804       struct charset *charset;
4805
           /* Remember the start of this sequence so that an invalid or
              incomplete one can be rewound.  */
4806       src_base = src;
4807       consumed_chars_base = consumed_chars;
4808
4809       if (charbuf >= charbuf_end)
4810         {
               /* The byte read ahead after a CR has not been processed;
                  step back so that it is not reported as consumed.  */
4811           if (byte_after_cr >= 0)
4812             src_base--;
4813           break;
4814         }
4815
4816       if (byte_after_cr >= 0)
4817         c = byte_after_cr, byte_after_cr = -1;
4818       else
4819         ONE_MORE_BYTE (c);
4820       if (c < 0)
4821         goto invalid_code;
4822       if (c < 0x80)
4823         {
               /* With DOS EOL conversion, read one byte ahead after a
                  CR; it is processed by the next iteration.  */
4824           if (eol_crlf && c == '\r')
4825             ONE_MORE_BYTE (byte_after_cr);
4826           charset = charset_roman;
4827         }
4828       else if (c == 0x80 || c == 0xA0)
4829         goto invalid_code;
4830       else if (c >= 0xA1 && c <= 0xDF)
4831         {
4832           /* SJIS -> JISX0201-Kana */
4833           c &= 0x7F;
4834           charset = charset_kana;
4835         }
4836       else if (c <= 0xEF)
4837         {
4838           /* SJIS -> JISX0208 */
4839           ONE_MORE_BYTE (c1);
4840           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4841             goto invalid_code;
4842           c = (c << 8) | c1;
4843           SJIS_TO_JIS (c);
4844           charset = charset_kanji;
4845         }
4846       else if (c <= 0xFC && charset_kanji2)
4847         {
4848           /* SJIS -> JISX0213-2 */
4849           ONE_MORE_BYTE (c1);
4850           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4851             goto invalid_code;
4852           c = (c << 8) | c1;
4853           SJIS_TO_JIS2 (c);
4854           charset = charset_kanji2;
4855         }
4856       else
4857         goto invalid_code;
           /* A new non-ASCII charset starts a new run; add the
              annotation for the previous run if that one was also
              non-ASCII.  */
4858       if (charset->id != charset_ascii
4859           && last_id != charset->id)
4860         {
4861           if (last_id != charset_ascii)
4862             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4863           last_id = charset->id;
4864           last_offset = char_offset;
4865         }
4866       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4867       *charbuf++ = c;
4868       char_offset++;
4869       continue;
4870
4871     invalid_code:
           /* Rewind to the start of the sequence and store just its
              first byte: negated if ONE_MORE_BYTE returned a negative
              value, otherwise converted with BYTE8_TO_CHAR.  */
4872       src = src_base;
4873       consumed_chars = consumed_chars_base;
4874       ONE_MORE_BYTE (c);
4875       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4876       char_offset++;
4877       coding->errors++;
4878     }
4879
       /* Reached by falling out of the loop when CHARBUF is full and,
          presumably, by a jump from ONE_MORE_BYTE when the source is
          exhausted.  Only input up to SRC_BASE is reported as
          consumed.  */
4880  no_more_source:
4881   if (last_id != charset_ascii)
4882     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4883   coding->consumed_char += consumed_chars_base;
4884   coding->consumed = src_base - coding->source;
4885   coding->charbuf_used = charbuf - coding->charbuf;
4886 }
4887
4888 static void
4889 decode_coding_big5 (coding)
4890      struct coding_system *coding;
4891 {
       /* Decode BIG5 text.  Bytes are read from CODING->source +
          CODING->consumed and decoded characters are appended to
          CODING->charbuf.  The first two elements of the charset list
          are used as the roman and Big5 charsets.  Bytes that cannot
          be decoded are stored one by one and counted in
          CODING->errors.  */
4892   const unsigned char *src = coding->source + coding->consumed;
4893   const unsigned char *src_end = coding->source + coding->src_bytes;
4894   const unsigned char *src_base;
4895   int *charbuf = coding->charbuf + coding->charbuf_used;
4896   /* We may produce one charset annotation in one loop and one more at
4897      the end.  */
4898   int *charbuf_end
4899     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
4900   int consumed_chars = 0, consumed_chars_base;
4901   int multibytep = coding->src_multibyte;
4902   struct charset *charset_roman, *charset_big5;
4903   Lisp_Object attrs, charset_list, val;
       /* CHAR_OFFSET counts decoded characters; LAST_OFFSET and LAST_ID
          describe the current run of characters of one non-ASCII
          charset, for which a charset annotation is added when the run
          ends.  */
4904   int char_offset = coding->produced_char;
4905   int last_offset = char_offset;
4906   int last_id = charset_ascii;
4907   int eol_crlf =
4908     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
       /* Byte already read after a CR (see below), or -1 if none.  */
4909   int byte_after_cr = -1;
4910
4911   CODING_GET_INFO (coding, attrs, charset_list);
4912   val = charset_list;
4913   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4914   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4915
4916   while (1)
4917     {
4918       int c, c1;
4919       struct charset *charset;
4920
           /* Remember the start of this sequence so that an invalid or
              incomplete one can be rewound.  */
4921       src_base = src;
4922       consumed_chars_base = consumed_chars;
4923
4924       if (charbuf >= charbuf_end)
4925         {
               /* The byte read ahead after a CR has not been processed;
                  step back so that it is not reported as consumed.  */
4926           if (byte_after_cr >= 0)
4927             src_base--;
4928           break;
4929         }
4930
4931       if (byte_after_cr >= 0)
4932         c = byte_after_cr, byte_after_cr = -1;
4933       else
4934         ONE_MORE_BYTE (c);
4935
4936       if (c < 0)
4937         goto invalid_code;
4938       if (c < 0x80)
4939         {
               /* With DOS EOL conversion, read one byte ahead after a
                  CR; it is processed by the next iteration.  */
4940           if (eol_crlf && c == '\r')
4941             ONE_MORE_BYTE (byte_after_cr);
4942           charset = charset_roman;
4943         }
4944       else
4945         {
4946           /* BIG5 -> Big5 */
               /* First byte must be 0xA1..0xFE; second byte must be
                  0x40..0x7E or 0xA1..0xFE.  */
4947           if (c < 0xA1 || c > 0xFE)
4948             goto invalid_code;
4949           ONE_MORE_BYTE (c1);
4950           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4951             goto invalid_code;
4952           c = c << 8 | c1;
4953           charset = charset_big5;
4954         }
           /* A new non-ASCII charset starts a new run; add the
              annotation for the previous run if that one was also
              non-ASCII.  */
4955       if (charset->id != charset_ascii
4956           && last_id != charset->id)
4957         {
4958           if (last_id != charset_ascii)
4959             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4960           last_id = charset->id;
4961           last_offset = char_offset;
4962         }
4963       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4964       *charbuf++ = c;
4965       char_offset++;
4966       continue;
4967
4968     invalid_code:
           /* Rewind to the start of the sequence and store just its
              first byte: negated if ONE_MORE_BYTE returned a negative
              value, otherwise converted with BYTE8_TO_CHAR.  */
4969       src = src_base;
4970       consumed_chars = consumed_chars_base;
4971       ONE_MORE_BYTE (c);
4972       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4973       char_offset++;
4974       coding->errors++;
4975     }
4976
       /* Reached by falling out of the loop when CHARBUF is full and,
          presumably, by a jump from ONE_MORE_BYTE when the source is
          exhausted.  Only input up to SRC_BASE is reported as
          consumed.  */
4977  no_more_source:
4978   if (last_id != charset_ascii)
4979     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4980   coding->consumed_char += consumed_chars_base;
4981   coding->consumed = src_base - coding->source;
4982   coding->charbuf_used = charbuf - coding->charbuf;
4983 }
4984
4985 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4986    Encode the characters of CODING->charbuf as SJIS text.  The first
4987    three elements of the coding system's charset list are taken as
4988    the roman, kana and kanji charsets, and an optional fourth one as
4989    a second kanji charset.  A kanji character is emitted as two
4990    bytes, a kana character as its code with the 0x80 bit set, and a
4991    character of any other charset as the low seven bits of its code.
        Always records CODING_RESULT_SUCCESS and returns 0.  */
4992
4993 static int
4994 encode_coding_sjis (coding)
4995      struct coding_system *coding;
4996 {
4997   int multibytep = coding->dst_multibyte;
4998   int *charbuf = coding->charbuf;
4999   int *charbuf_end = charbuf + coding->charbuf_used;
5000   unsigned char *dst = coding->destination + coding->produced;
5001   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Argument given to ASSURE_DESTINATION before each element.  */
5002   int safe_room = 4;
5003   int produced_chars = 0;
5004   Lisp_Object attrs, charset_list, val;
5005   int ascii_compatible;
       /* charset_roman is assigned below but not otherwise used in this
          function.  */
5006   struct charset *charset_roman, *charset_kanji, *charset_kana;
5007   struct charset *charset_kanji2;
5008   int c;
5009
5010   CODING_GET_INFO (coding, attrs, charset_list);
5011   val = charset_list;
5012   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5013   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5014   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5015   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
5016
5017   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5018
5019   while (charbuf < charbuf_end)
5020     {
5021       ASSURE_DESTINATION (safe_room);
5022       c = *charbuf++;
5023       /* Now encode the character C.  */
5024       if (ASCII_CHAR_P (c) && ascii_compatible)
5025         EMIT_ONE_ASCII_BYTE (c);
5026       else if (CHAR_BYTE8_P (c))
5027         {
5028           c = CHAR_TO_BYTE8 (c);
5029           EMIT_ONE_BYTE (c);
5030         }
5031       else
5032         {
5033           unsigned code;
5034           struct charset *charset = char_charset (c, charset_list, &code);
5035
5036           if (!charset)
5037             {
                   /* No charset in the list contains C.  In
                      safe-encoding mode use
                      CODING_INHIBIT_CHARACTER_SUBSTITUTION as the code
                      with the ASCII charset, otherwise encode the
                      coding system's default character.  */
5038               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5039                 {
5040                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5041                   charset = CHARSET_FROM_ID (charset_ascii);
5042                 }
5043               else
5044                 {
                       /* NOTE(review): if char_charset returned NULL
                          here too, CHARSET would be null in the check
                          below -- confirm default_char is always
                          encodable.  */
5045                   c = coding->default_char;
5046                   charset = char_charset (c, charset_list, &code);
5047                 }
5048             }
5049           if (code == CHARSET_INVALID_CODE (charset))
5050             abort ();
5051           if (charset == charset_kanji)
5052             {
5053               int c1, c2;
5054               JIS_TO_SJIS (code);
5055               c1 = code >> 8, c2 = code & 0xFF;
5056               EMIT_TWO_BYTES (c1, c2);
5057             }
5058           else if (charset == charset_kana)
5059             EMIT_ONE_BYTE (code | 0x80);
5060           else if (charset_kanji2 && charset == charset_kanji2)
5061             {
5062               int c1, c2;
5063
                   /* Only codes whose high byte is 0x21, 0x23..0x25,
                      0x28, 0x2C..0x2F or at least 0x6E are converted
                      with JIS_TO_SJIS2; any other code is emitted as
                      its low seven bits.  */
5064               c1 = code >> 8;
5065               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25)
5066                   || c1 == 0x28
5067                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
5068                 {
5069                   JIS_TO_SJIS2 (code);
5070                   c1 = code >> 8, c2 = code & 0xFF;
5071                   EMIT_TWO_BYTES (c1, c2);
5072                 }
5073               else
5074                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
5075             }
5076           else
5077             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5078         }
5079     }
5080   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5081   coding->produced_char += produced_chars;
5082   coding->produced = dst - coding->destination;
5083   return 0;
5084 }
5085
5086 static int
5087 encode_coding_big5 (coding)
5088      struct coding_system *coding;
5089 {
       /* Encode the characters of CODING->charbuf as BIG5 text.  The
          first two elements of the coding system's charset list are
          taken as the roman and Big5 charsets.  A Big5 character is
          emitted as the two bytes of its code; a character of any
          other charset is emitted as the low seven bits of its code.
          Always records CODING_RESULT_SUCCESS and returns 0.  */
5090   int multibytep = coding->dst_multibyte;
5091   int *charbuf = coding->charbuf;
5092   int *charbuf_end = charbuf + coding->charbuf_used;
5093   unsigned char *dst = coding->destination + coding->produced;
5094   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Argument given to ASSURE_DESTINATION before each element.  */
5095   int safe_room = 4;
5096   int produced_chars = 0;
5097   Lisp_Object attrs, charset_list, val;
5098   int ascii_compatible;
       /* charset_roman is assigned below but not otherwise used in this
          function.  */
5099   struct charset *charset_roman, *charset_big5;
5100   int c;
5101
5102   CODING_GET_INFO (coding, attrs, charset_list);
5103   val = charset_list;
5104   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
5105   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
5106   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5107
5108   while (charbuf < charbuf_end)
5109     {
5110       ASSURE_DESTINATION (safe_room);
5111       c = *charbuf++;
5112       /* Now encode the character C.  */
5113       if (ASCII_CHAR_P (c) && ascii_compatible)
5114         EMIT_ONE_ASCII_BYTE (c);
5115       else if (CHAR_BYTE8_P (c))
5116         {
5117           c = CHAR_TO_BYTE8 (c);
5118           EMIT_ONE_BYTE (c);
5119         }
5120       else
5121         {
5122           unsigned code;
5123           struct charset *charset = char_charset (c, charset_list, &code);
5124
5125           if (! charset)
5126             {
                   /* No charset in the list contains C.  In
                      safe-encoding mode use
                      CODING_INHIBIT_CHARACTER_SUBSTITUTION as the code
                      with the ASCII charset, otherwise encode the
                      coding system's default character.  */
5127               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5128                 {
5129                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5130                   charset = CHARSET_FROM_ID (charset_ascii);
5131                 }
5132               else
5133                 {
                       /* NOTE(review): if char_charset returned NULL
                          here too, CHARSET would be null in the check
                          below -- confirm default_char is always
                          encodable.  */
5134                   c = coding->default_char;
5135                   charset = char_charset (c, charset_list, &code);
5136                 }
5137             }
5138           if (code == CHARSET_INVALID_CODE (charset))
5139             abort ();
5140           if (charset == charset_big5)
5141             {
5142               int c1, c2;
5143
5144               c1 = code >> 8, c2 = code & 0xFF;
5145               EMIT_TWO_BYTES (c1, c2);
5146             }
5147           else
5148             EMIT_ONE_ASCII_BYTE (code & 0x7F);
5149         }
5150     }
5151   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5152   coding->produced_char += produced_chars;
5153   coding->produced = dst - coding->destination;
5154   return 0;
5155 }
5156
5157 \f
5158 /*** 9. CCL handlers ***/
5159
5160 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5161    Check if a text is encoded in a coding system of which
5162    encoder/decoder are written in CCL program.  When a byte is
5163    negative or has a zero entry in the valid-byte table, set
        CATEGORY_MASK_CCL in DETECT_INFO->rejected and return 0.
        Otherwise, when the source runs out, add FOUND to
        DETECT_INFO->found and return 1.  */
5164
5165 static int
5166 detect_coding_ccl (coding, detect_info)
5167      struct coding_system *coding;
5168      struct coding_detection_info *detect_info;
5169 {
5170   const unsigned char *src = coding->source, *src_base;
5171   const unsigned char *src_end = coding->source + coding->src_bytes;
5172   int multibytep = coding->src_multibyte;
5173   int consumed_chars = 0;
5174   int found = 0;
5175   unsigned char *valids;
       /* Taken from the CODING argument before CODING is redirected
          below.  */
5176   int head_ascii = coding->head_ascii;
5177   Lisp_Object attrs;
5178
5179   detect_info->checked |= CATEGORY_MASK_CCL;
5180
       /* From here on CODING refers to the coding system registered
          for the CCL category, not to the argument.  */
5181   coding = &coding_categories[coding_category_ccl];
5182   valids = CODING_CCL_VALIDS (coding);
5183   attrs = CODING_ID_ATTRS (coding->id);
       /* The leading ASCII part is skipped only for an ASCII compatible
          coding system.  */
5184   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5185     src += head_ascii;
5186
5187   while (1)
5188     {
5189       int c;
5190
5191       src_base = src;
5192       ONE_MORE_BYTE (c);
5193       if (c < 0 || ! valids[c])
5194         break;
           /* A table entry greater than 1 counts as positive evidence;
              the meaning of the entry values is defined where the
              table is built.  */
5195       if ((valids[c] > 1))
5196         found = CATEGORY_MASK_CCL;
5197     }
5198   detect_info->rejected |= CATEGORY_MASK_CCL;
5199   return 0;
5200
       /* Presumably the label ONE_MORE_BYTE jumps to when the source is
          exhausted (the macro is defined elsewhere).  */
5201  no_more_source:
5202   detect_info->found |= found;
5203   return 1;
5204 }
5205
5206 static void
5207 decode_coding_ccl (coding)
5208      struct coding_system *coding;
5209 {
       /* Decode the source of CODING with the coding system's CCL
          decoder.  The source is converted, at most 1024 elements at a
          time, into an int array (characters for a multibyte source,
          bytes otherwise) which is fed to ccl_driver; the driver
          writes its output directly into CODING->charbuf.  The final
          CCL status is translated into a conversion result, and
          CODING->consumed, CODING->consumed_char and
          CODING->charbuf_used are updated.  */
5210   const unsigned char *src = coding->source + coding->consumed;
5211   const unsigned char *src_end = coding->source + coding->src_bytes;
5212   int *charbuf = coding->charbuf + coding->charbuf_used;
5213   int *charbuf_end = coding->charbuf + coding->charbuf_size;
5214   int consumed_chars = 0;
5215   int multibytep = coding->src_multibyte;
5216   struct ccl_program ccl;
5217   int source_charbuf[1024];
       /* Byte offset from SRC of each element of source_charbuf.  It
          is filled only for a multibyte source.  */
5218   int source_byteidx[1024];
5219   Lisp_Object attrs, charset_list;
5220
5221   CODING_GET_INFO (coding, attrs, charset_list);
5222   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
5223
5224   while (src < src_end)
5225     {
5226       const unsigned char *p = src;
5227       int *source, *source_end;
5228       int i = 0;
5229
5230       if (multibytep)
5231         while (i < 1024 && p < src_end)
5232           {
5233             source_byteidx[i] = p - src;
5234             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
5235           }
5236       else
5237         while (i < 1024 && p < src_end)
5238           source_charbuf[i++] = *p++;
5239
5240       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
5241         ccl.last_block = 1;
5242
5243       source = source_charbuf;
5244       source_end = source + i;
5245       while (source < source_end)
5246         {
5247           ccl_driver (&ccl, source, charbuf,
5248                       source_end - source, charbuf_end - charbuf,
5249                       charset_list);
5250           source += ccl.consumed;
5251           charbuf += ccl.produced;
5252           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
5253             break;
5254         }
           /* Advance SRC past what the CCL program consumed.  When it
              stopped early, the byte offset comes from source_byteidx
              for a multibyte source.  For a unibyte source that table
              is never filled, and each element is exactly one byte, so
              the element count is the byte offset.  */
5255       if (source >= source_end)
5256         src = p;
           else if (multibytep)
             src += source_byteidx[source - source_charbuf];
5257       else
5258         src += source - source_charbuf;
5259       consumed_chars += source - source_charbuf;
5260
           /* NOTE(review): the second test compares a CCL status with
              a CODING_RESULT_xxx code; it is kept as is -- confirm
              what was intended.  */
5261       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
5262           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
5263         break;
5264     }
5265
5266   switch (ccl.status)
5267     {
5268     case CCL_STAT_SUSPEND_BY_SRC:
5269       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5270       break;
5271     case CCL_STAT_SUSPEND_BY_DST:
5272       break;
5273     case CCL_STAT_QUIT:
5274     case CCL_STAT_INVALID_CMD:
5275       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5276       break;
5277     default:
5278       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5279       break;
5280     }
5281   coding->consumed_char += consumed_chars;
5282   coding->consumed = src - coding->source;
5283   coding->charbuf_used = charbuf - coding->charbuf;
5284 }
5285
5286 static int
5287 encode_coding_ccl (coding)
5288      struct coding_system *coding;
5289 {
       /* Encode the characters of CODING->charbuf with the coding
          system's CCL encoder.  ccl_driver writes up to 1024 values
          per call into a local buffer; the low eight bits of each
          value are then stored in the destination.  The final CCL
          status is translated into a conversion result.  Returns 0.  */
5290   struct ccl_program ccl;
5291   int multibytep = coding->dst_multibyte;
5292   int *charbuf = coding->charbuf;
5293   int *charbuf_end = charbuf + coding->charbuf_used;
5294   unsigned char *dst = coding->destination + coding->produced;
5295   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5296   int destination_charbuf[1024];
5297   int i, produced_chars = 0;
5298   Lisp_Object attrs, charset_list;
5299
5300   CODING_GET_INFO (coding, attrs, charset_list);
5301   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
5302
5303   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
5304   ccl.dst_multibyte = coding->dst_multibyte;
5305
       /* NOTE(review): the loop advances only by ccl.consumed and
          leaves early only on QUIT or INVALID_CMD -- confirm that
          ccl_driver always makes progress for the other statuses.  */
5306   while (charbuf < charbuf_end)
5307     {
5308       ccl_driver (&ccl, charbuf, destination_charbuf,
5309                   charbuf_end - charbuf, 1024, charset_list);
5310       if (multibytep)
5311         {
               /* Twice the number of produced values is requested for
                  a multibyte destination.  */
5312           ASSURE_DESTINATION (ccl.produced * 2);
5313           for (i = 0; i < ccl.produced; i++)
5314             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
5315         }
5316       else
5317         {
5318           ASSURE_DESTINATION (ccl.produced);
5319           for (i = 0; i < ccl.produced; i++)
5320             *dst++ = destination_charbuf[i] & 0xFF;
5321           produced_chars += ccl.produced;
5322         }
5323       charbuf += ccl.consumed;
5324       if (ccl.status == CCL_STAT_QUIT
5325           || ccl.status == CCL_STAT_INVALID_CMD)
5326         break;
5327     }
5328
5329   switch (ccl.status)
5330     {
5331     case CCL_STAT_SUSPEND_BY_SRC:
5332       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5333       break;
5334     case CCL_STAT_SUSPEND_BY_DST:
5335       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
5336       break;
5337     case CCL_STAT_QUIT:
5338     case CCL_STAT_INVALID_CMD:
5339       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
5340       break;
5341     default:
5342       record_conversion_result (coding, CODING_RESULT_SUCCESS);
5343       break;
5344     }
5345
5346   coding->produced_char += produced_chars;
5347   coding->produced = dst - coding->destination;
5348   return 0;
5349 }
5350
5351
5352 \f
5353 /*** 10, 11. no-conversion handlers ***/
5354
5355 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".

        Nothing is copied into CODING->charbuf here.  The function sets
        CODING->chars_at_source and reports the whole source as
        consumed.  The one exception: when DOS EOL conversion is in
        effect and the source ends with a CR, that CR is left
        unconsumed and CODING_RESULT_INSUFFICIENT_SRC is recorded.  */
5356
5357 static void
5358 decode_coding_raw_text (coding)
5359      struct coding_system *coding;
5360 {
5361   int eol_crlf =
5362     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
5363
5364   coding->chars_at_source = 1;
5365   coding->consumed_char = coding->src_chars;
5366   coding->consumed = coding->src_bytes;
       /* Look at the last source byte only if there is one; with an
          empty source the index would be -1.  */
5367   if (eol_crlf && coding->src_bytes > 0
           && coding->source[coding->src_bytes - 1] == '\r')
5368     {
5369       coding->consumed_char--;
5370       coding->consumed--;
5371       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
5372     }
5373   else
5374     record_conversion_result (coding, CODING_RESULT_SUCCESS);
5375 }
5376
5377 static int
5378 encode_coding_raw_text (coding)
5379      struct coding_system *coding;
5380 {
       /* Copy the elements of CODING->charbuf to the destination
          without charset conversion.  There are four cases, selected
          by whether the destination and the source are multibyte.
          Always records CODING_RESULT_SUCCESS and returns 0.  */
5381   int multibytep = coding->dst_multibyte;
5382   int *charbuf = coding->charbuf;
5383   int *charbuf_end = coding->charbuf + coding->charbuf_used;
5384   unsigned char *dst = coding->destination + coding->produced;
5385   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5386   int produced_chars = 0;
5387   int c;
5388
5389   if (multibytep)
5390     {
5391       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
5392
5393       if (coding->src_multibyte)
5394         while (charbuf < charbuf_end)
5395           {
5396             ASSURE_DESTINATION (safe_room);
5397             c = *charbuf++;
5398             if (ASCII_CHAR_P (c))
5399               EMIT_ONE_ASCII_BYTE (c);
5400             else if (CHAR_BYTE8_P (c))
5401               {
5402                 c = CHAR_TO_BYTE8 (c);
5403                 EMIT_ONE_BYTE (c);
5404               }
5405             else
5406               {
                     /* Build the multibyte form of C in STR, then emit
                        it one byte at a time with EMIT_ONE_BYTE.  */
5407                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
5408
5409                 CHAR_STRING_ADVANCE (c, p1);
5410                 while (p0 < p1)
5411                   {
5412                     EMIT_ONE_BYTE (*p0);
5413                     p0++;
5414                   }
5415               }
5416           }
5417       else
             /* Unibyte source: every element goes through
                EMIT_ONE_BYTE.  */
5418         while (charbuf < charbuf_end)
5419           {
5420             ASSURE_DESTINATION (safe_room);
5421             c = *charbuf++;
5422             EMIT_ONE_BYTE (c);
5423           }
5424     }
5425   else
5426     {
5427       if (coding->src_multibyte)
5428         {
5429           int safe_room = MAX_MULTIBYTE_LENGTH;
5430
5431           while (charbuf < charbuf_end)
5432             {
5433               ASSURE_DESTINATION (safe_room);
5434               c = *charbuf++;
                   /* ASCII and byte8 characters are stored as a single
                      byte; any other character is stored in its
                      multibyte form.  */
5435               if (ASCII_CHAR_P (c))
5436                 *dst++ = c;
5437               else if (CHAR_BYTE8_P (c))
5438                 *dst++ = CHAR_TO_BYTE8 (c);
5439               else
5440                 CHAR_STRING_ADVANCE (c, dst);
5441             }
5442         }
5443       else
5444         {
               /* Unibyte to unibyte: request room for all elements at
                  once, then copy them one to one.  */
5445           ASSURE_DESTINATION (charbuf_end - charbuf);
5446           while (charbuf < charbuf_end && dst < dst_end)
5447             *dst++ = *charbuf++;
5448         }
           /* For a unibyte destination the number of produced
              characters is the number of bytes stored by this call.  */
5449       produced_chars = dst - (coding->destination + coding->produced);
5450     }
5451   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5452   coding->produced_char += produced_chars;
5453   coding->produced = dst - coding->destination;
5454   return 0;
5455 }
5456
5457 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
5458    Check if a text is encoded in a charset-based coding system.  If it
5459    is, return 1, else return 0.
        CATEGORY_MASK_CHARSET is always added to DETECT_INFO->checked.  On
        rejection it is also added to DETECT_INFO->rejected.  On success
        it is added to DETECT_INFO->found only if at least one byte >=
        0x80 was accepted.
        NOTE(review): ONE_MORE_BYTE is defined elsewhere; it presumably
        jumps to `no_more_source' when the source is exhausted and uses
        the locals `multibytep' and `consumed_chars' -- confirm there.  */
5460
5461 static int
5462 detect_coding_charset (coding, detect_info)
5463      struct coding_system *coding;
5464      struct coding_detection_info *detect_info;
5465 {
5466   const unsigned char *src = coding->source, *src_base;
5467   const unsigned char *src_end = coding->source + coding->src_bytes;
5468   int multibytep = coding->src_multibyte;
5469   int consumed_chars = 0;
5470   Lisp_Object attrs, valids, name;
5471   int found = 0;
5472   int head_ascii = coding->head_ascii;
5473   int check_latin_extra = 0;
5474
5475   detect_info->checked |= CATEGORY_MASK_CHARSET;
5476
        /* From here on CODING refers to the coding system registered for
           the charset category, not to the caller's argument.  */
5477   coding = &coding_categories[coding_category_charset];
5478   attrs = CODING_ID_ATTRS (coding->id);
5479   valids = AREF (attrs, coding_attr_charset_valids);
5480   name = CODING_ID_NAME (coding->id);
        /* For coding systems named iso-8859-* or iso-latin-*, bytes in
           the range 0x80..0x9F are accepted only if they are listed in
           Vlatin_extra_code_table (see the loop below).  */
5481   if (strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5482                "iso-8859-", sizeof ("iso-8859-") - 1) == 0
5483       || strncmp ((char *) SDATA (SYMBOL_NAME (name)),
5484                   "iso-latin-", sizeof ("iso-latin-") - 1) == 0)
5485     check_latin_extra = 1;
5486
        /* Skip the leading bytes counted by HEAD_ASCII when the coding
           system is ASCII compatible.  */
5487   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
5488     src += head_ascii;
5489
5490   while (1)
5491     {
5492       int c;
5493       Lisp_Object val;
5494       struct charset *charset;
5495       int dim, idx;
5496
5497       src_base = src;
5498       ONE_MORE_BYTE (c);
5499       if (c < 0)
5500         continue;
            /* VAL is nil for a byte that cannot start a character,
               otherwise a charset ID or a list of charset IDs.  */
5501       val = AREF (valids, c);
5502       if (NILP (val))
5503         break;
5504       if (c >= 0x80)
5505         {
5506           if (c < 0xA0
5507               && check_latin_extra
5508               && (!VECTORP (Vlatin_extra_code_table)
5509                   || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c])))
5510             break;
5511           found = CATEGORY_MASK_CHARSET;
5512         }
5513       if (INTEGERP (val))
5514         {
                  /* A single candidate charset: every trailing byte must
                     lie within the byte range of its dimension.  */
5515           charset = CHARSET_FROM_ID (XFASTINT (val));
5516           dim = CHARSET_DIMENSION (charset);
5517           for (idx = 1; idx < dim; idx++)
5518             {
5519               if (src == src_end)
5520                 goto too_short;
5521               ONE_MORE_BYTE (c);
                      /* Index code_space with a stride of 4, exactly as
                         the charset-list branch below does; a stride of 2
                         picks the wrong entries when DIM is 3 or more.  */
5522               if (c < charset->code_space[(dim - 1 - idx) * 4]
5523                   || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5524                 break;
5525             }
5526           if (idx < dim)
5527             break;
5528         }
5529       else
5530         {
                  /* A list of candidate charsets: accept the sequence as
                     soon as one charset's dimension is satisfied.  IDX is
                     kept across candidates, so bytes that were already
                     read are not read again.  */
5531           idx = 1;
5532           for (; CONSP (val); val = XCDR (val))
5533             {
5534               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5535               dim = CHARSET_DIMENSION (charset);
5536               while (idx < dim)
5537                 {
5538                   if (src == src_end)
5539                     goto too_short;
5540                   ONE_MORE_BYTE (c);
5541                   if (c < charset->code_space[(dim - 1 - idx) * 4]
5542                       || c > charset->code_space[(dim - 1 - idx) * 4 + 1])
5543                     break;
5544                   idx++;
5545                 }
5546               if (idx == dim)
5547                 {
                          /* Matched: clear VAL so the CONSP test after
                             the loop does not reject the text.  */
5548                   val = Qnil;
5549                   break;
5550                 }
5551             }
5552           if (CONSP (val))
5553             break;
5554         }
5555     }
        /* Reached by `break' (invalid byte sequence) or by `goto' (the
           source ended in the middle of a multi-byte sequence).  */
5556  too_short:
5557   detect_info->rejected |= CATEGORY_MASK_CHARSET;
5558   return 0;
5559
5560  no_more_source:
5561   detect_info->found |= found;
5562   return 1;
5563 }
5564
     /* Decode the bytes of CODING->source, starting at CODING->consumed,
        with the charset-based coding system CODING, and store the
        decoded characters in CODING->charbuf.  On exit,
        CODING->consumed, CODING->consumed_char and CODING->charbuf_used
        are updated.  A byte sequence that cannot be decoded is passed
        through one byte at a time and counted in CODING->errors.
        NOTE(review): ONE_MORE_BYTE, CODING_DECODE_CHAR and
        ADD_CHARSET_DATA are macros defined elsewhere; ONE_MORE_BYTE
        presumably jumps to `no_more_source' when the source is
        exhausted -- confirm there.  */
5565 static void
5566 decode_coding_charset (coding)
5567      struct coding_system *coding;
5568 {
5569   const unsigned char *src = coding->source + coding->consumed;
5570   const unsigned char *src_end = coding->source + coding->src_bytes;
5571   const unsigned char *src_base;
5572   int *charbuf = coding->charbuf + coding->charbuf_used;
5573   /* We may produce one charset annotation in one loop and one more at
5574      the end.  */
5575   int *charbuf_end
5576     = coding->charbuf + coding->charbuf_size - (MAX_ANNOTATION_LENGTH * 2);
5577   int consumed_chars = 0, consumed_chars_base;
5578   int multibytep = coding->src_multibyte;
5579   Lisp_Object attrs, charset_list, valids;
5580   int char_offset = coding->produced_char;
5581   int last_offset = char_offset;
5582   int last_id = charset_ascii;
        /* Nonzero when EOL conversion is not inhibited and the EOL type
           of CODING is Qdos.  */
5583   int eol_crlf =
5584     !inhibit_eol_conversion && EQ (CODING_ID_EOL_TYPE (coding->id), Qdos);
        /* The byte read ahead after a CR, or -1 if there is none.  */
5585   int byte_after_cr = -1;
5586
5587   CODING_GET_INFO (coding, attrs, charset_list);
5588   valids = AREF (attrs, coding_attr_charset_valids);
5589
5590   while (1)
5591     {
5592       int c;
5593       Lisp_Object val;
5594       struct charset *charset;
5595       int dim;
5596       int len = 1;
5597       unsigned code;
5598
            /* Remember where this character starts so that an invalid
               sequence can be rewound and the consumed counts reported
               correctly.  */
5599       src_base = src;
5600       consumed_chars_base = consumed_chars;
5601
5602       if (charbuf >= charbuf_end)
5603         {
                  /* No room left.  If a byte read ahead after CR is still
                     pending, step SRC_BASE back so that this byte is not
                     reported as consumed.  */
5604           if (byte_after_cr >= 0)
5605             src_base--;
5606           break;
5607         }
5608
5609       if (byte_after_cr >= 0)
5610         {
                  /* Process the byte that was read ahead after a CR.  */
5611           c = byte_after_cr;
5612           byte_after_cr = -1;
5613         }
5614       else
5615         {
5616           ONE_MORE_BYTE (c);
                  /* With DOS-style EOL, read the byte following a CR
                     ahead; it is handled on the next iteration.  */
5617           if (eol_crlf && c == '\r')
5618             ONE_MORE_BYTE (byte_after_cr);
5619         }
5620       if (c < 0)
5621         goto invalid_code;
5622       code = c;
5623
            /* VAL must be a charset ID or a list of charset IDs for a
               valid leading byte.  */
5624       val = AREF (valids, c);
5625       if (! INTEGERP (val) && ! CONSP (val))
5626         goto invalid_code;
5627       if (INTEGERP (val))
5628         {
5629           charset = CHARSET_FROM_ID (XFASTINT (val));
5630           dim = CHARSET_DIMENSION (charset);
                  /* Collect the remaining bytes of the code, most
                     significant byte first.  */
5631           while (len < dim)
5632             {
5633               ONE_MORE_BYTE (c);
5634               code = (code << 8) | c;
5635               len++;
5636             }
5637           CODING_DECODE_CHAR (coding, src, src_base, src_end,
5638                               charset, code, c);
5639         }
5640       else
5641         {
5642           /* VAL is a list of charset IDs.  It is assured that the
5643              list is sorted by charset dimensions (smaller one
5644              comes first).  */
5645           while (CONSP (val))
5646             {
5647               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
5648               dim = CHARSET_DIMENSION (charset);
5649               while (len < dim)
5650                 {
5651                   ONE_MORE_BYTE (c);
5652                   code = (code << 8) | c;
5653                   len++;
5654                 }
5655               CODING_DECODE_CHAR (coding, src, src_base,
5656                                   src_end, charset, code, c);
                      /* Stop at the first charset for which decoding
                         left a non-negative value in C.  */
5657               if (c >= 0)
5658                 break;
5659               val = XCDR (val);
5660             }
5661         }
5662       if (c < 0)
5663         goto invalid_code;
            /* When the charset changes to a different non-ASCII charset,
               record the run of the previous non-ASCII charset (if any)
               and start a new run here.  */
5664       if (charset->id != charset_ascii
5665           && last_id != charset->id)
5666         {
5667           if (last_id != charset_ascii)
5668             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5669           last_id = charset->id;
5670           last_offset = char_offset;
5671         }
5672
5673       *charbuf++ = c;
5674       char_offset++;
5675       continue;
5676
5677     invalid_code:
            /* Rewind to the start of this character and pass exactly one
               byte through: a negative value is negated, an ASCII byte is
               kept as is, and any other byte is converted with
               BYTE8_TO_CHAR.  */
5678       src = src_base;
5679       consumed_chars = consumed_chars_base;
5680       ONE_MORE_BYTE (c);
5681       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
5682       char_offset++;
5683       coding->errors++;
5684     }
5685
5686  no_more_source:
        /* Record the last pending non-ASCII run, then report how much of
           the source was consumed, counted up to the start of the last
           incomplete character.  */
5687   if (last_id != charset_ascii)
5688     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
5689   coding->consumed_char += consumed_chars_base;
5690   coding->consumed = src_base - coding->source;
5691   coding->charbuf_used = charbuf - coding->charbuf;
5692 }
5693
     /* Encode the characters held in CODING->charbuf with the
        charset-based coding system CODING and append the bytes to
        CODING->destination.  On the visible paths the function records
        CODING_RESULT_SUCCESS, advances CODING->produced_char and
        CODING->produced, and returns 0.
        NOTE(review): ASSURE_DESTINATION and the EMIT_* macros are defined
        elsewhere; presumably they check the room left in the destination
        and count emitted characters in `produced_chars' -- confirm
        against their definitions.  */
5694 static int
5695 encode_coding_charset (coding)
5696      struct coding_system *coding;
5697 {
5698   int multibytep = coding->dst_multibyte;
5699   int *charbuf = coding->charbuf;
5700   int *charbuf_end = charbuf + coding->charbuf_used;
5701   unsigned char *dst = coding->destination + coding->produced;
5702   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5703   int safe_room = MAX_MULTIBYTE_LENGTH;
5704   int produced_chars = 0;
5705   Lisp_Object attrs, charset_list;
5706   int ascii_compatible;
5707   int c;
5708
5709   CODING_GET_INFO (coding, attrs, charset_list);
5710   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
5711
5712   while (charbuf < charbuf_end)
5713     {
5714       struct charset *charset;
5715       unsigned code;
5716
5717       ASSURE_DESTINATION (safe_room);
5718       c = *charbuf++;
            /* ASCII characters are emitted unchanged only when the coding
               system is ASCII compatible.  */
5719       if (ascii_compatible && ASCII_CHAR_P (c))
5720         EMIT_ONE_ASCII_BYTE (c);
5721       else if (CHAR_BYTE8_P (c))
5722         {
                  /* A raw 8-bit byte character: emit the byte it stands
                     for.  */
5723           c = CHAR_TO_BYTE8 (c);
5724           EMIT_ONE_BYTE (c);
5725         }
5726       else
5727         {
                  /* Look for a charset in CHARSET_LIST that contains C;
                     char_charset stores the code point in CODE.  */
5728           charset = char_charset (c, charset_list, &code);
5729           if (charset)
5730             {
                      /* Emit CODE using as many bytes as the charset's
                         dimension, most significant byte first.  */
5731               if (CHARSET_DIMENSION (charset) == 1)
5732                 EMIT_ONE_BYTE (code);
5733               else if (CHARSET_DIMENSION (charset) == 2)
5734                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
5735               else if (CHARSET_DIMENSION (charset) == 3)
5736                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
5737               else
5738                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
5739                                  (code >> 8) & 0xFF, code & 0xFF);
5740             }
5741           else
5742             {
                      /* No charset can encode C: emit a one-byte
                         substitute, chosen by the safe-encoding mode
                         flag.  */
5743               if (coding->mode & CODING_MODE_SAFE_ENCODING)
5744                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
5745               else
5746                 c = coding->default_char;
5747               EMIT_ONE_BYTE (c);
5748             }
5749         }
5750     }
5751
5752   record_conversion_result (coding, CODING_RESULT_SUCCESS);
5753   coding->produced_char += produced_chars;
5754   coding->produced = dst - coding->destination;
5755   return 0;
5756 }
5757
5758 \f
5759 /*** 10. C library functions ***/
5760
5761 /* Setup coding context CODING from information about CODING_SYSTEM.
5762    If CODING_SYSTEM is nil, Qundecided is used in its place.  If
5763    CODING_SYSTEM is invalid, signal an error.
        The function sets CODING->id, resets the mode, head_ascii and
        carryover_bytes fields, computes CODING->common_flags, records the
        safe-charset data and the default character, and installs the
        detector, decoder and encoder that match the type of the coding
        system.
        NOTE(review): the error for an invalid CODING_SYSTEM is presumably
        raised inside CHECK_CODING_SYSTEM_GET_ID, which is defined
        elsewhere -- confirm there.  */
5764
5765 void
5766 setup_coding_system (coding_system, coding)
5767      Lisp_Object coding_system;
5768      struct coding_system *coding;
5769 {
5770   Lisp_Object attrs;
5771   Lisp_Object eol_type;
5772   Lisp_Object coding_type;
5773   Lisp_Object val;
5774
5775   if (NILP (coding_system))
5776     coding_system = Qundecided;
5777
5778   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
5779
5780   attrs = CODING_ID_ATTRS (coding->id);
        /* When EOL conversion is inhibited, behave as if the EOL type
           were Qunix.  */
5781   eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
5782
5783   coding->mode = 0;
5784   coding->head_ascii = -1;
        /* Initial flags depend on the EOL type: a vector requires
           decoding and detection, a type other than Qunix requires
           decoding and encoding, and Qunix requires nothing by
           itself.  */
5785   if (VECTORP (eol_type))
5786     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5787                             | CODING_REQUIRE_DETECTION_MASK);
5788   else if (! EQ (eol_type, Qunix))
5789     coding->common_flags = (CODING_REQUIRE_DECODING_MASK
5790                             | CODING_REQUIRE_ENCODING_MASK);
5791   else
5792     coding->common_flags = 0;
        /* A post-read attribute forces decoding, a pre-write attribute
           forces encoding.  */
5793   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5794     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5795   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5796     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5797   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
5798     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
5799
        /* The safe-charsets attribute is a string; CODING keeps a
           pointer to its data and the largest valid index.  */
5800   val = CODING_ATTR_SAFE_CHARSETS (attrs);
5801   coding->max_charset_id = SCHARS (val) - 1;
5802   coding->safe_charsets = SDATA (val);
5803   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
5804   coding->carryover_bytes = 0;
5805
        /* Install the handlers for the type of the coding system.  */
5806   coding_type = CODING_ATTR_TYPE (attrs);
5807   if (EQ (coding_type, Qundecided))
5808     {
            /* Undecided: no detector, raw-text handlers, and detection is
               required.  */
5809       coding->detector = NULL;
5810       coding->decoder = decode_coding_raw_text;
5811       coding->encoder = encode_coding_raw_text;
5812       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5813     }
5814   else if (EQ (coding_type, Qiso_2022))
5815     {
5816       int i;
5817       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5818
5819       /* Invoke graphic register 0 to plane 0.  */
5820       CODING_ISO_INVOCATION (coding, 0) = 0;
5821       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
5822       CODING_ISO_INVOCATION (coding, 1)
5823         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
5824       /* Setup the initial status of designation.  */
5825       for (i = 0; i < 4; i++)
5826         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5827       /* Not single shifting initially.  */
5828       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5829       /* Beginning of buffer should also be regarded as bol. */
5830       CODING_ISO_BOL (coding) = 1;
5831       coding->detector = detect_coding_iso_2022;
5832       coding->decoder = decode_coding_iso_2022;
5833       coding->encoder = encode_coding_iso_2022;
5834       if (flags & CODING_ISO_FLAG_SAFE)
5835         coding->mode |= CODING_MODE_SAFE_ENCODING;
5836       coding->common_flags
5837         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5838             | CODING_REQUIRE_FLUSHING_MASK);
5839       if (flags & CODING_ISO_FLAG_COMPOSITION)
5840         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5841       if (flags & CODING_ISO_FLAG_DESIGNATION)
5842         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5843       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5844         {
                  /* Re-read the safe-charsets attribute after
                     setup_iso_safe_charsets has run.  */
5845           setup_iso_safe_charsets (attrs);
5846           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5847           coding->max_charset_id = SCHARS (val) - 1;
5848           coding->safe_charsets = SDATA (val);
5849         }
5850       CODING_ISO_FLAGS (coding) = flags;
5851       CODING_ISO_CMP_STATUS (coding)->state = COMPOSING_NO;
5852       CODING_ISO_CMP_STATUS (coding)->method = COMPOSITION_NO;
5853       CODING_ISO_EXTSEGMENT_LEN (coding) = 0;
5854       CODING_ISO_EMBEDDED_UTF_8 (coding) = 0;
5855     }
5856   else if (EQ (coding_type, Qcharset))
5857     {
5858       coding->detector = detect_coding_charset;
5859       coding->decoder = decode_coding_charset;
5860       coding->encoder = encode_coding_charset;
5861       coding->common_flags
5862         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5863     }
5864   else if (EQ (coding_type, Qutf_8))
5865     {
            /* BOM attribute: a cons means detect the BOM, t means always
               with BOM, anything else means without BOM.  */
5866       val = AREF (attrs, coding_attr_utf_bom);
5867       CODING_UTF_8_BOM (coding) = (CONSP (val) ? utf_detect_bom
5868                                    : EQ (val, Qt) ? utf_with_bom
5869                                    : utf_without_bom);
5870       coding->detector = detect_coding_utf_8;
5871       coding->decoder = decode_coding_utf_8;
5872       coding->encoder = encode_coding_utf_8;
5873       coding->common_flags
5874         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5875       if (CODING_UTF_8_BOM (coding) == utf_detect_bom)
5876         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5877     }
5878   else if (EQ (coding_type, Qutf_16))
5879     {
            /* Same BOM handling as for UTF-8; the endian attribute
               selects big endian only when it is Qbig.  */
5880       val = AREF (attrs, coding_attr_utf_bom);
5881       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_detect_bom
5882                                     : EQ (val, Qt) ? utf_with_bom
5883                                     : utf_without_bom);
5884       val = AREF (attrs, coding_attr_utf_16_endian);
5885       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5886                                        : utf_16_little_endian);
5887       CODING_UTF_16_SURROGATE (coding) = 0;
5888       coding->detector = detect_coding_utf_16;
5889       coding->decoder = decode_coding_utf_16;
5890       coding->encoder = encode_coding_utf_16;
5891       coding->common_flags
5892         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5893       if (CODING_UTF_16_BOM (coding) == utf_detect_bom)
5894         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5895     }
5896   else if (EQ (coding_type, Qccl))
5897     {
5898       coding->detector = detect_coding_ccl;
5899       coding->decoder = decode_coding_ccl;
5900       coding->encoder = encode_coding_ccl;
5901       coding->common_flags
5902         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5903             | CODING_REQUIRE_FLUSHING_MASK);
5904     }
5905   else if (EQ (coding_type, Qemacs_mule))
5906     {
5907       coding->detector = detect_coding_emacs_mule;
5908       coding->decoder = decode_coding_emacs_mule;
5909       coding->encoder = encode_coding_emacs_mule;
5910       coding->common_flags
5911         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5912       coding->spec.emacs_mule.full_support = 1;
5913       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5914           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5915         {
                  /* Build a fresh safe-charsets string from
                     Vemacs_mule_charset_list: every byte starts as 255
                     and the byte of each listed charset ID is set to
                     0.  */
5916           Lisp_Object tail, safe_charsets;
5917           int max_charset_id = 0;
5918
5919           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5920                tail = XCDR (tail))
5921             if (max_charset_id < XFASTINT (XCAR (tail)))
5922               max_charset_id = XFASTINT (XCAR (tail));
5923           safe_charsets = make_uninit_string (max_charset_id + 1);
5924           memset (SDATA (safe_charsets), 255, max_charset_id + 1);
5925           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5926                tail = XCDR (tail))
5927             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5928           coding->max_charset_id = max_charset_id;
5929           coding->safe_charsets = SDATA (safe_charsets);
                  /* NOTE(review): full_support was already set to 1
                     before this `if', so this assignment is redundant as
                     written.  */
5930           coding->spec.emacs_mule.full_support = 1;
5931         }
5932       coding->spec.emacs_mule.cmp_status.state = COMPOSING_NO;
5933       coding->spec.emacs_mule.cmp_status.method = COMPOSITION_NO;
5934     }
5935   else if (EQ (coding_type, Qshift_jis))
5936     {
5937       coding->detector = detect_coding_sjis;
5938       coding->decoder = decode_coding_sjis;
5939       coding->encoder = encode_coding_sjis;
5940       coding->common_flags
5941         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5942     }
5943   else if (EQ (coding_type, Qbig5))
5944     {
5945       coding->detector = detect_coding_big5;
5946       coding->decoder = decode_coding_big5;
5947       coding->encoder = encode_coding_big5;
5948       coding->common_flags
5949         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5950     }
5951   else                          /* EQ (coding_type, Qraw_text) */
5952     {
5953       coding->detector = NULL;
5954       coding->decoder = decode_coding_raw_text;
5955       coding->encoder = encode_coding_raw_text;
            /* Raw text needs conversion only for EOL handling: decoding
               unless the EOL type is Qunix, and encoding only when the
               EOL type is also not a vector.  */
5956       if (! EQ (eol_type, Qunix))
5957         {
5958           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5959           if (! VECTORP (eol_type))
5960             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5961         }
5962
5963     }
5964
5965   return;
5966 }
5967
5968 /* Return a list of charsets supported by CODING.
        For an iso-2022 coding system with the
        CODING_ISO_FLAG_FULL_SUPPORT flag the result is
        Viso_2022_charset_list, for an emacs-mule coding system it is
        Vemacs_mule_charset_list, and otherwise it is the list obtained
        by CODING_GET_INFO.
        NOTE(review): CODING_GET_INFO is defined elsewhere; it presumably
        stores the attributes and the charset list of CODING in its
        second and third arguments -- confirm there.  */
5969
5970 Lisp_Object
5971 coding_charset_list (coding)
5972      struct coding_system *coding;
5973 {
5974   Lisp_Object attrs, charset_list;
5975
5976   CODING_GET_INFO (coding, attrs, charset_list);
5977   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5978     {
5979       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5980
            /* Without the full-support flag CHARSET_LIST is left as
               CODING_GET_INFO set it.  */
5981       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5982         charset_list = Viso_2022_charset_list;
5983     }
5984   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5985     {
5986       charset_list = Vemacs_mule_charset_list;
5987     }
5988   return charset_list;
5989 }
5990
5991
5992 /* Return a list of charsets supported by CODING-SYSTEM.
        For an iso-2022 coding system with the
        CODING_ISO_FLAG_FULL_SUPPORT flag the result is
        Viso_2022_charset_list, for an emacs-mule coding system it is
        Vemacs_mule_charset_list, and otherwise it is the charset-list
        attribute of the coding system.
        NOTE(review): CHECK_CODING_SYSTEM_GET_ID is defined elsewhere; it
        presumably validates CODING-SYSTEM and stores its ID in the second
        argument -- confirm there.  */
5993
5994 Lisp_Object
5995 coding_system_charset_list (coding_system)
5996      Lisp_Object coding_system;
5997 {
5998   int id;
5999   Lisp_Object attrs, charset_list;
6000
6001   CHECK_CODING_SYSTEM_GET_ID (coding_system, id);
6002   attrs = CODING_ID_ATTRS (id);
6003
6004   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
6005     {
6006       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
6007
6008       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
6009         charset_list = Viso_2022_charset_list;
6010       else
6011         charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6012     }
6013   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
6014     {
6015       charset_list = Vemacs_mule_charset_list;
6016     }
6017   else
6018     {
6019       charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6020     }
6021   return charset_list;
6022 }
6023
6024
6025 /* Return raw-text or one of its subsidiaries that has the same
6026    eol_type as CODING-SYSTEM.
        If CODING-SYSTEM is nil, return Qraw_text.  If the type of
        CODING-SYSTEM is already Qraw_text, return CODING-SYSTEM itself.
        If the eol_type of CODING-SYSTEM is a vector, return Qraw_text.  */
6027
6028 Lisp_Object
6029 raw_text_coding_system (coding_system)
6030      Lisp_Object coding_system;
6031 {
6032   Lisp_Object spec, attrs;
6033   Lisp_Object eol_type, raw_text_eol_type;
6034
6035   if (NILP (coding_system))
6036     return Qraw_text;
        /* Element 0 of the spec holds the attributes, element 2 the
           eol_type.  */
6037   spec = CODING_SYSTEM_SPEC (coding_system);
6038   attrs = AREF (spec, 0);
6039
6040   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
6041     return coding_system;
6042
6043   eol_type = AREF (spec, 2);
6044   if (VECTORP (eol_type))
6045     return Qraw_text;
        /* Pick the element of raw-text's own eol_type that matches:
           index 0 for Qunix, 1 for Qdos, 2 for anything else.  */
6046   spec = CODING_SYSTEM_SPEC (Qraw_text);
6047   raw_text_eol_type = AREF (spec, 2);
6048   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
6049           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
6050           : AREF (raw_text_eol_type, 2));
6051 }
6052
6053
6054 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
6055    does, return one of the subsidiary that has the same eol-spec as
6056    PARENT.  Otherwise, return CODING_SYSTEM.  If PARENT is nil,
6057    inherit end-of-line format from the system's setting
6058    (system_eol_type).
        A nil CODING_SYSTEM is treated as Qraw_text.  CODING_SYSTEM is
        considered not to specify an end-of-line format when element 2 of
        its spec is a vector.  */
6059
6060 Lisp_Object
6061 coding_inherit_eol_type (coding_system, parent)
6062      Lisp_Object coding_system, parent;
6063 {
6064   Lisp_Object spec, eol_type;
6065
6066   if (NILP (coding_system))
6067     coding_system = Qraw_text;
6068   spec = CODING_SYSTEM_SPEC (coding_system);
6069   eol_type = AREF (spec, 2);
6070   if (VECTORP (eol_type))
6071     {
6072       Lisp_Object parent_eol_type;
6073
6074       if (! NILP (parent))
6075         {
6076           Lisp_Object parent_spec;
6077
6078           parent_spec = CODING_SYSTEM_SPEC (parent);
6079           parent_eol_type = AREF (parent_spec, 2);
6080         }
6081       else
6082         parent_eol_type = system_eol_type;
            /* Select element 0, 1 or 2 of EOL_TYPE for Qunix, Qdos or
               Qmac.  For any other PARENT_EOL_TYPE, CODING_SYSTEM is
               returned unchanged.  */
6083       if (EQ (parent_eol_type, Qunix))
6084         coding_system = AREF (eol_type, 0);
6085       else if (EQ (parent_eol_type, Qdos))
6086         coding_system = AREF (eol_type, 1);
6087       else if (EQ (parent_eol_type, Qmac))
6088         coding_system = AREF (eol_type, 2);
6089     }
6090   return coding_system;
6091 }
6092
6093 /* Emacs has a mechanism to automatically detect a coding system if it
6094    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
6095    it's impossible to distinguish some coding systems accurately
6096    because they use the same range of codes.  So, at first, coding
6097    systems are categorized as follows:
6098
6099    o coding-category-emacs-mule
6100
6101         The category for a coding system which has the same code range
6102         as Emacs' internal format.  Assigned the coding-system (Lisp
6103         symbol) `emacs-mule' by default.
6104
6105    o coding-category-sjis
6106
6107         The category for a coding system which has the same code range
6108         as SJIS.  Assigned the coding-system (Lisp
6109         symbol) `japanese-shift-jis' by default.
6110
6111    o coding-category-iso-7
6112
6113         The category for a coding system which has the same code range
6114         as ISO2022 of 7-bit environment.  This doesn't use any locking
6115         shift and single shift functions.  This can encode/decode all
6116         charsets.  Assigned the coding-system (Lisp symbol)
6117         `iso-2022-7bit' by default.
6118
6119    o coding-category-iso-7-tight
6120
6121         Same as coding-category-iso-7 except that this can
6122         encode/decode only the specified charsets.
6123
6124    o coding-category-iso-8-1
6125
6126         The category for a coding system which has the same code range
6127         as ISO2022 of 8-bit environment and graphic plane 1 used only
6128         for DIMENSION1 charset.  This doesn't use any locking shift
6129         and single shift functions.  Assigned the coding-system (Lisp
6130         symbol) `iso-latin-1' by default.
6131
6132    o coding-category-iso-8-2
6133
6134         The category for a coding system which has the same code range
6135         as ISO2022 of 8-bit environment and graphic plane 1 used only
6136         for DIMENSION2 charset.  This doesn't use any locking shift
6137         and single shift functions.  Assigned the coding-system (Lisp
6138         symbol) `japanese-iso-8bit' by default.
6139
6140    o coding-category-iso-7-else
6141
6142         The category for a coding system which has the same code range
6143         as ISO2022 of 7-bit environment but uses locking shift or
6144         single shift functions.  Assigned the coding-system (Lisp
6145         symbol) `iso-2022-7bit-lock' by default.
6146
6147    o coding-category-iso-8-else
6148
6149         The category for a coding system which has the same code range
6150         as ISO2022 of 8-bit environment but uses locking shift or
6151         single shift functions.  Assigned the coding-system (Lisp
6152         symbol) `iso-2022-8bit-ss2' by default.
6153
6154    o coding-category-big5
6155
6156         The category for a coding system which has the same code range
6157         as BIG5.  Assigned the coding-system (Lisp symbol)
6158         `cn-big5' by default.
6159
6160    o coding-category-utf-8
6161
6162         The category for a coding system which has the same code range
6163         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
6164         symbol) `utf-8' by default.
6165
6166    o coding-category-utf-16-be
6167
6168         The category for a coding system in which a text has a
6169         Unicode signature (cf. Unicode Standard) in the order of BIG
6170         endian at the head.  Assigned the coding-system (Lisp symbol)
6171         `utf-16-be' by default.
6172
6173    o coding-category-utf-16-le
6174
6175         The category for a coding system in which a text has a
6176         Unicode signature (cf. Unicode Standard) in the order of
6177         LITTLE endian at the head.  Assigned the coding-system (Lisp
6178         symbol) `utf-16-le' by default.
6179
6180    o coding-category-ccl
6181
6182         The category for a coding system of which encoder/decoder is
6183         written in CCL programs.  The default value is nil, i.e., no
6184         coding system is assigned.
6185
6186    o coding-category-binary
6187
6188         The category for a coding system not categorized in any of the
6189         above.  Assigned the coding-system (Lisp symbol)
6190         `no-conversion' by default.
6191
6192    Each of them is a Lisp symbol and the value is an actual
6193    `coding-system's (this is also a Lisp symbol) assigned by a user.
6194    What Emacs does actually is to detect a category of coding system.
6195    Then, it uses a `coding-system' assigned to it.  If Emacs can't
6196    decide only one possible category, it selects a category of the
6197    highest priority.  Priorities of categories are also specified by a
6198    user in a Lisp variable `coding-category-list'.
6199
6200 */
6201
     /* End-of-line formats reported by detect_eol.  The values other than
        EOL_SEEN_NONE are distinct bits; adjust_coding_eol_type tests them
        with `&'.  */
6202 #define EOL_SEEN_NONE   0
6203 #define EOL_SEEN_LF     1
6204 #define EOL_SEEN_CR     2
6205 #define EOL_SEEN_CRLF   4
6206
6207 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
6208    SOURCE is encoded.  If CATEGORY is one of
6209    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
6210    two-byte, else they are encoded by one-byte.
6211
6212    Return one of EOL_SEEN_XXX.
        At most MAX_EOL_CHECK_COUNT end-of-lines are examined.  A mixture
        of CR and CRLF is reported as EOL_SEEN_CRLF; any other mixture is
        reported as EOL_SEEN_LF and stops the scan.  */
6213
6214 #define MAX_EOL_CHECK_COUNT 3
6215
6216 static int
6217 detect_eol (source, src_bytes, category)
6218      const unsigned char *source;
6219      EMACS_INT src_bytes;
6220      enum coding_category category;
6221 {
6222   const unsigned char *src = source, *src_end = src + src_bytes;
6223   unsigned char c;
6224   int total  = 0;
6225   int eol_seen = EOL_SEEN_NONE;
6226
6227   if ((1 << category) & CATEGORY_MASK_UTF_16)
6228     {
            /* Offsets, within each 2-byte unit, of the most and the least
               significant byte.  */
6229       int msb, lsb;
6230
            /* CATEGORY is an index (it is used as a shift count above),
               so it must be compared against each little-endian category
               separately; OR-ing the two enumerators yields an unrelated
               value.  For little endian the most significant byte is at
               offset 1.  */
6231       msb = (category == coding_category_utf_16_le
6232              || category == coding_category_utf_16_le_nosig);
6233       lsb = 1 - msb;
6234
6235       while (src + 1 < src_end)
6236         {
6237           c = src[lsb];
6238           if (src[msb] == 0 && (c == '\n' || c == '\r'))
6239             {
6240               int this_eol;
6241
6242               if (c == '\n')
6243                 this_eol = EOL_SEEN_LF;
                      /* A CR counts as CRLF only if a complete 2-byte LF
                         follows it.  */
6244               else if (src + 3 >= src_end
6245                        || src[msb + 2] != 0
6246                        || src[lsb + 2] != '\n')
6247                 this_eol = EOL_SEEN_CR;
6248               else
6249                 {
6250                   this_eol = EOL_SEEN_CRLF;
6251                   src += 2;
6252                 }
6253
6254               if (eol_seen == EOL_SEEN_NONE)
6255                 /* This is the first end-of-line.  */
6256                 eol_seen = this_eol;
6257               else if (eol_seen != this_eol)
6258                 {
6259                   /* The found type is different from what found before.
6260                      Allow for stray ^M characters in DOS EOL files.  */
6261                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6262                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6263                     eol_seen = EOL_SEEN_CRLF;
6264                   else
6265                     {
6266                       eol_seen = EOL_SEEN_LF;
6267                       break;
6268                     }
6269                 }
6270               if (++total == MAX_EOL_CHECK_COUNT)
6271                 break;
6272             }
6273           src += 2;
6274         }
6275     }
6276   else
6277     {
            /* One-byte CR and LF.  */
6278       while (src < src_end)
6279         {
6280           c = *src++;
6281           if (c == '\n' || c == '\r')
6282             {
6283               int this_eol;
6284
6285               if (c == '\n')
6286                 this_eol = EOL_SEEN_LF;
6287               else if (src >= src_end || *src != '\n')
6288                 this_eol = EOL_SEEN_CR;
6289               else
6290                 this_eol = EOL_SEEN_CRLF, src++;
6291
6292               if (eol_seen == EOL_SEEN_NONE)
6293                 /* This is the first end-of-line.  */
6294                 eol_seen = this_eol;
6295               else if (eol_seen != this_eol)
6296                 {
6297                   /* The found type is different from what found before.
6298                      Allow for stray ^M characters in DOS EOL files.  */
6299                   if (eol_seen == EOL_SEEN_CR && this_eol == EOL_SEEN_CRLF
6300                       || eol_seen == EOL_SEEN_CRLF && this_eol == EOL_SEEN_CR)
6301                     eol_seen = EOL_SEEN_CRLF;
6302                   else
6303                     {
6304                       eol_seen = EOL_SEEN_LF;
6305                       break;
6306                     }
6307                 }
6308               if (++total == MAX_EOL_CHECK_COUNT)
6309                 break;
6310             }
6311         }
6312     }
6313   return eol_seen;
6314 }
6315
6316
6317 static Lisp_Object
6318 adjust_coding_eol_type (coding, eol_seen)
6319      struct coding_system *coding;
6320      int eol_seen;
6321 {
6322   Lisp_Object eol_type;
6323
6324   eol_type = CODING_ID_EOL_TYPE (coding->id);
6325   if (eol_seen & EOL_SEEN_LF)
6326     {
6327       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
6328       eol_type = Qunix;
6329     }
6330   else if (eol_seen & EOL_SEEN_CRLF)
6331     {
6332       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
6333       eol_type = Qdos;
6334     }
6335   else if (eol_seen & EOL_SEEN_CR)
6336     {
6337       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
6338       eol_type = Qmac;
6339     }
6340   return eol_type;
6341 }
6342
6343 /* Detect how a text specified in CODING is encoded.  If a coding
6344    system is detected, update fields of CODING by the detected coding
6345    system.  */
6346
6347 void
6348 detect_coding (coding)
6349      struct coding_system *coding;
6350 {
6351   const unsigned char *src, *src_end;
6352   int saved_mode = coding->mode;
6353
6354   coding->consumed = coding->consumed_char = 0;
6355   coding->produced = coding->produced_char = 0;
6356   coding_set_source (coding);
6357
6358   src_end = coding->source + coding->src_bytes;
6359   coding->head_ascii = 0;
6360
6361   /* If we have not yet decided the text encoding type, detect it
6362      now.  */
6363   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
6364     {
6365       int c, i;
6366       struct coding_detection_info detect_info;
6367       int null_byte_found = 0, eight_bit_found = 0;
6368
6369       detect_info.checked = detect_info.found = detect_info.rejected = 0;
6370       for (src = coding->source; src < src_end; src++)
6371         {
6372           c = *src;
6373           if (c & 0x80)
6374             {
6375               eight_bit_found = 1;
6376               if (null_byte_found)
6377                 break;
6378             }
6379           else if (c < 0x20)
6380             {
6381               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
6382                   && ! inhibit_iso_escape_detection
6383                   && ! detect_info.checked)
6384                 {
6385                   if (detect_coding_iso_2022 (coding, &detect_info))
6386                     {
6387                       /* We have scanned the whole data.  */
6388                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
6389                         {
6390                           /* We didn't find an 8-bit code.  We may
6391                              have found a null-byte, but it's very
6392                              rare that a binary file confirm to
6393                              ISO-2022.  */
6394                           src = src_end;
6395                           coding->head_ascii = src - coding->source;
6396                         }
6397                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
6398                       break;
6399                     }
6400                 }
6401               else if (! c && !inhibit_null_byte_detection)
6402                 {
6403                   null_byte_found = 1;
6404                   if (eight_bit_found)
6405                     break;
6406                 }
6407               if (! eight_bit_found)
6408                 coding->head_ascii++;
6409             }
6410           else if (! eight_bit_found)
6411             coding->head_ascii++;
6412         }
6413
6414       if (null_byte_found || eight_bit_found
6415           || coding->head_ascii < coding->src_bytes
6416           || detect_info.found)
6417         {
6418           enum coding_category category;
6419           struct coding_system *this;
6420
6421           if (coding->head_ascii == coding->src_bytes)
6422             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
6423             for (i = 0; i < coding_category_raw_text; i++)
6424               {
6425                 category = coding_priorities[i];
6426                 this = coding_categories + category;
6427                 if (detect_info.found & (1 << category))
6428                   break;
6429               }
6430           else
6431             {
6432               if (null_byte_found)
6433                 {
6434                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
6435                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
6436                 }
6437               for (i = 0; i < coding_category_raw_text; i++)
6438                 {
6439                   category = coding_priorities[i];
6440                   this = coding_categories + category;
6441                   if (this->id < 0)
6442                     {
6443                       /* No coding system of this category is defined.  */
6444                       detect_info.rejected |= (1 << category);
6445                     }
6446                   else if (category >= coding_category_raw_text)
6447                     continue;
6448                   else if (detect_info.checked & (1 << category))
6449                     {
6450                       if (detect_info.found & (1 << category))
6451                         break;
6452                     }
6453                   else if ((*(this->detector)) (coding, &detect_info)
6454                            && detect_info.found & (1 << category))
6455                     {
6456                       if (category == coding_category_utf_16_auto)
6457                         {
6458                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6459                             category = coding_category_utf_16_le;
6460                           else
6461                             category = coding_category_utf_16_be;
6462                         }
6463                       break;
6464                     }
6465                 }
6466             }
6467
6468           if (i < coding_category_raw_text)
6469             setup_coding_system (CODING_ID_NAME (this->id), coding);
6470           else if (null_byte_found)
6471             setup_coding_system (Qno_conversion, coding);
6472           else if ((detect_info.rejected & CATEGORY_MASK_ANY)
6473                    == CATEGORY_MASK_ANY)
6474             setup_coding_system (Qraw_text, coding);
6475           else if (detect_info.rejected)
6476             for (i = 0; i < coding_category_raw_text; i++)
6477               if (! (detect_info.rejected & (1 << coding_priorities[i])))
6478                 {
6479                   this = coding_categories + coding_priorities[i];
6480                   setup_coding_system (CODING_ID_NAME (this->id), coding);
6481                   break;
6482                 }
6483         }
6484     }
6485   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6486            == coding_category_utf_8_auto)
6487     {
6488       Lisp_Object coding_systems;
6489       struct coding_detection_info detect_info;
6490
6491       coding_systems
6492         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6493       detect_info.found = detect_info.rejected = 0;
6494       coding->head_ascii = 0;
6495       if (CONSP (coding_systems)
6496           && detect_coding_utf_8 (coding, &detect_info))
6497         {
6498           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
6499             setup_coding_system (XCAR (coding_systems), coding);
6500           else
6501             setup_coding_system (XCDR (coding_systems), coding);
6502         }
6503     }
6504   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
6505            == coding_category_utf_16_auto)
6506     {
6507       Lisp_Object coding_systems;
6508       struct coding_detection_info detect_info;
6509
6510       coding_systems
6511         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_bom);
6512       detect_info.found = detect_info.rejected = 0;
6513       coding->head_ascii = 0;
6514       if (CONSP (coding_systems)
6515           && detect_coding_utf_16 (coding, &detect_info))
6516         {
6517           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
6518             setup_coding_system (XCAR (coding_systems), coding);
6519           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
6520             setup_coding_system (XCDR (coding_systems), coding);
6521         }
6522     }
6523   coding->mode = saved_mode;
6524 }
6525
6526
6527 static void
6528 decode_eol (coding)
6529      struct coding_system *coding;
6530 {
6531   Lisp_Object eol_type;
6532   unsigned char *p, *pbeg, *pend;
6533
6534   eol_type = CODING_ID_EOL_TYPE (coding->id);
6535   if (EQ (eol_type, Qunix) || inhibit_eol_conversion)
6536     return;
6537
6538   if (NILP (coding->dst_object))
6539     pbeg = coding->destination;
6540   else
6541     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
6542   pend = pbeg + coding->produced;
6543
6544   if (VECTORP (eol_type))
6545     {
6546       int eol_seen = EOL_SEEN_NONE;
6547
6548       for (p = pbeg; p < pend; p++)
6549         {
6550           if (*p == '\n')
6551             eol_seen |= EOL_SEEN_LF;
6552           else if (*p == '\r')
6553             {
6554               if (p + 1 < pend && *(p + 1) == '\n')
6555                 {
6556                   eol_seen |= EOL_SEEN_CRLF;
6557                   p++;
6558                 }
6559               else
6560                 eol_seen |= EOL_SEEN_CR;
6561             }
6562         }
6563       /* Handle DOS-style EOLs in a file with stray ^M characters.  */
6564       if ((eol_seen & EOL_SEEN_CRLF) != 0
6565           && (eol_seen & EOL_SEEN_CR) != 0
6566           && (eol_seen & EOL_SEEN_LF) == 0)
6567         eol_seen = EOL_SEEN_CRLF;
6568       else if (eol_seen != EOL_SEEN_NONE
6569           && eol_seen != EOL_SEEN_LF
6570           && eol_seen != EOL_SEEN_CRLF
6571           && eol_seen != EOL_SEEN_CR)
6572         eol_seen = EOL_SEEN_LF;
6573       if (eol_seen != EOL_SEEN_NONE)
6574         eol_type = adjust_coding_eol_type (coding, eol_seen);
6575     }
6576
6577   if (EQ (eol_type, Qmac))
6578     {
6579       for (p = pbeg; p < pend; p++)
6580         if (*p == '\r')
6581           *p = '\n';
6582     }
6583   else if (EQ (eol_type, Qdos))
6584     {
6585       int n = 0;
6586
6587       if (NILP (coding->dst_object))
6588         {
6589           /* Start deleting '\r' from the tail to minimize the memory
6590              movement.  */
6591           for (p = pend - 2; p >= pbeg; p--)
6592             if (*p == '\r')
6593               {
6594                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
6595                 n++;
6596               }
6597         }
6598       else
6599         {
6600           int pos_byte = coding->dst_pos_byte;
6601           int pos = coding->dst_pos;
6602           int pos_end = pos + coding->produced_char - 1;
6603
6604           while (pos < pos_end)
6605             {
6606               p = BYTE_POS_ADDR (pos_byte);
6607               if (*p == '\r' && p[1] == '\n')
6608                 {
6609                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
6610                   n++;
6611                   pos_end--;
6612                 }
6613               pos++;
6614               if (coding->dst_multibyte)
6615                 pos_byte += BYTES_BY_CHAR_HEAD (*p);
6616               else
6617                 pos_byte++;
6618             }
6619         }
6620       coding->produced -= n;
6621       coding->produced_char -= n;
6622     }
6623 }
6624
6625
6626 /* Return a translation table (or list of them) from coding system
6627    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
6628    decoding (ENCODEP is zero). */
6629
6630 static Lisp_Object
6631 get_translation_table (attrs, encodep, max_lookup)
6632      Lisp_Object attrs;
6633      int encodep, *max_lookup;
6634 {
6635   Lisp_Object standard, translation_table;
6636   Lisp_Object val;
6637
6638   if (NILP (Venable_character_translation))
6639     {
6640       if (max_lookup)
6641         *max_lookup = 0;
6642       return Qnil;
6643     }
6644   if (encodep)
6645     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
6646       standard = Vstandard_translation_table_for_encode;
6647   else
6648     translation_table = CODING_ATTR_DECODE_TBL (attrs),
6649       standard = Vstandard_translation_table_for_decode;
6650   if (NILP (translation_table))
6651     translation_table = standard;
6652   else
6653     {
6654       if (SYMBOLP (translation_table))
6655         translation_table = Fget (translation_table, Qtranslation_table);
6656       else if (CONSP (translation_table))
6657         {
6658           translation_table = Fcopy_sequence (translation_table);
6659           for (val = translation_table; CONSP (val); val = XCDR (val))
6660             if (SYMBOLP (XCAR (val)))
6661               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
6662         }
6663       if (CHAR_TABLE_P (standard))
6664         {
6665           if (CONSP (translation_table))
6666             translation_table = nconc2 (translation_table,
6667                                         Fcons (standard, Qnil));
6668           else
6669             translation_table = Fcons (translation_table,
6670                                        Fcons (standard, Qnil));
6671         }
6672     }
6673
6674   if (max_lookup)
6675     {
6676       *max_lookup = 1;
6677       if (CHAR_TABLE_P (translation_table)
6678           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
6679         {
6680           val = XCHAR_TABLE (translation_table)->extras[1];
6681           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6682             *max_lookup = XFASTINT (val);
6683         }
6684       else if (CONSP (translation_table))
6685         {
6686           Lisp_Object tail, val;
6687
6688           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
6689             if (CHAR_TABLE_P (XCAR (tail))
6690                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
6691               {
6692                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
6693                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
6694                   *max_lookup = XFASTINT (val);
6695               }
6696         }
6697     }
6698   return translation_table;
6699 }
6700
/* Look up character C in TABLE and set TRANS to the result.

   TABLE is a char-table or a list; in a list only the elements that
   are char-tables are consulted, in order.  When an entry is a
   character, C is replaced by it and TRANS is set to Qnil (for a
   list, the lookup then goes on in the next table with the new C).
   Any other entry is left in TRANS; for a list the lookup stops at
   the first entry that is neither a character nor nil.

   C and TRANS must be lvalues.  All three arguments are evaluated
   more than once, so they must be free of side effects.  The
   arguments are parenthesized in the expansion and the internal
   variable has a name unlikely to clash with a caller's variable.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        (trans) = CHAR_TABLE_REF ((table), (c));                        \
        if (CHARACTERP (trans))                                         \
          (c) = XFASTINT (trans), (trans) = Qnil;                       \
      }                                                                 \
    else if (CONSP (table))                                             \
      {                                                                 \
        Lisp_Object lookup_tail_;                                       \
                                                                        \
        for (lookup_tail_ = (table); CONSP (lookup_tail_);              \
             lookup_tail_ = XCDR (lookup_tail_))                        \
          if (CHAR_TABLE_P (XCAR (lookup_tail_)))                       \
            {                                                           \
              (trans) = CHAR_TABLE_REF (XCAR (lookup_tail_), (c));      \
              if (CHARACTERP (trans))                                   \
                (c) = XFASTINT (trans), (trans) = Qnil;                 \
              else if (! NILP (trans))                                  \
                break;                                                  \
            }                                                           \
      }                                                                 \
  } while (0)
6725
6726
6727 /* Return a translation of character(s) at BUF according to TRANS.
6728    TRANS is TO-CHAR or ((FROM .  TO) ...) where
6729    FROM = [FROM-CHAR ...], TO is TO-CHAR or [TO-CHAR ...].
6730    The return value is TO-CHAR or ([FROM-CHAR ...] . TO) if a
6731    translation is found, and Qnil if not found..
6732    If BUF is too short to lookup characters in FROM, return Qt.  */
6733
6734 static Lisp_Object
6735 get_translation (trans, buf, buf_end)
6736      Lisp_Object trans;
6737      int *buf, *buf_end;
6738 {
6739
6740   if (INTEGERP (trans))
6741     return trans;
6742   for (; CONSP (trans); trans = XCDR (trans))
6743     {
6744       Lisp_Object val = XCAR (trans);
6745       Lisp_Object from = XCAR (val);
6746       int len = ASIZE (from);
6747       int i;
6748
6749       for (i = 0; i < len; i++)
6750         {
6751           if (buf + i == buf_end)
6752             return Qt;
6753           if (XINT (AREF (from, i)) != buf[i])
6754             break;
6755         }
6756       if (i == len)
6757         return val;
6758     }
6759   return Qnil;
6760 }
6761
6762
6763 static int
6764 produce_chars (coding, translation_table, last_block)
6765      struct coding_system *coding;
6766      Lisp_Object translation_table;
6767      int last_block;
6768 {
6769   unsigned char *dst = coding->destination + coding->produced;
6770   unsigned char *dst_end = coding->destination + coding->dst_bytes;
6771   EMACS_INT produced;
6772   EMACS_INT produced_chars = 0;
6773   int carryover = 0;
6774
6775   if (! coding->chars_at_source)
6776     {
6777       /* Source characters are in coding->charbuf.  */
6778       int *buf = coding->charbuf;
6779       int *buf_end = buf + coding->charbuf_used;
6780
6781       if (EQ (coding->src_object, coding->dst_object))
6782         {
6783           coding_set_source (coding);
6784           dst_end = ((unsigned char *) coding->source) + coding->consumed;
6785         }
6786
6787       while (buf < buf_end)
6788         {
6789           int c = *buf, i;
6790
6791           if (c >= 0)
6792             {
6793               int from_nchars = 1, to_nchars = 1;
6794               Lisp_Object trans = Qnil;
6795
6796               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6797               if (! NILP (trans))
6798                 {
6799                   trans = get_translation (trans, buf, buf_end);
6800                   if (INTEGERP (trans))
6801                     c = XINT (trans);
6802                   else if (CONSP (trans))
6803                     {
6804                       from_nchars = ASIZE (XCAR (trans));
6805                       trans = XCDR (trans);
6806                       if (INTEGERP (trans))
6807                         c = XINT (trans);
6808                       else
6809                         {
6810                           to_nchars = ASIZE (trans);
6811                           c = XINT (AREF (trans, 0));
6812                         }
6813                     }
6814                   else if (EQ (trans, Qt) && ! last_block)
6815                     break;
6816                 }
6817
6818               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
6819                 {
6820                   dst = alloc_destination (coding,
6821                                            buf_end - buf
6822                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
6823                                            dst);
6824                   if (EQ (coding->src_object, coding->dst_object))
6825                     {
6826                       coding_set_source (coding);
6827                       dst_end = (((unsigned char *) coding->source)
6828                                  + coding->consumed);
6829                     }
6830                   else
6831                     dst_end = coding->destination + coding->dst_bytes;
6832                 }
6833
6834               for (i = 0; i < to_nchars; i++)
6835                 {
6836                   if (i > 0)
6837                     c = XINT (AREF (trans, i));
6838                   if (coding->dst_multibyte
6839                       || ! CHAR_BYTE8_P (c))
6840                     CHAR_STRING_ADVANCE_NO_UNIFY (c, dst);
6841                   else
6842                     *dst++ = CHAR_TO_BYTE8 (c);
6843                 }
6844               produced_chars += to_nchars;
6845               buf += from_nchars;
6846             }
6847           else
6848             /* This is an annotation datum.  (-C) is the length.  */
6849             buf += -c;
6850         }
6851       carryover = buf_end - buf;
6852     }
6853   else
6854     {
6855       /* Source characters are at coding->source.  */
6856       const unsigned char *src = coding->source;
6857       const unsigned char *src_end = src + coding->consumed;
6858
6859       if (EQ (coding->dst_object, coding->src_object))
6860         dst_end = (unsigned char *) src;
6861       if (coding->src_multibyte != coding->dst_multibyte)
6862         {
6863           if (coding->src_multibyte)
6864             {
6865               int multibytep = 1;
6866               EMACS_INT consumed_chars = 0;
6867
6868               while (1)
6869                 {
6870                   const unsigned char *src_base = src;
6871                   int c;
6872
6873                   ONE_MORE_BYTE (c);
6874                   if (dst == dst_end)
6875                     {
6876                       if (EQ (coding->src_object, coding->dst_object))
6877                         dst_end = (unsigned char *) src;
6878                       if (dst == dst_end)
6879                         {
6880                           EMACS_INT offset = src - coding->source;
6881
6882                           dst = alloc_destination (coding, src_end - src + 1,
6883                                                    dst);
6884                           dst_end = coding->destination + coding->dst_bytes;
6885                           coding_set_source (coding);
6886                           src = coding->source + offset;
6887                           src_end = coding->source + coding->src_bytes;
6888                           if (EQ (coding->src_object, coding->dst_object))
6889                             dst_end = (unsigned char *) src;
6890                         }
6891                     }
6892                   *dst++ = c;
6893                   produced_chars++;
6894                 }
6895             no_more_source:
6896               ;
6897             }
6898           else
6899             while (src < src_end)
6900               {
6901                 int multibytep = 1;
6902                 int c = *src++;
6903
6904                 if (dst >= dst_end - 1)
6905                   {
6906                     if (EQ (coding->src_object, coding->dst_object))
6907                       dst_end = (unsigned char *) src;
6908                     if (dst >= dst_end - 1)
6909                       {
6910                         EMACS_INT offset = src - coding->source;
6911                         EMACS_INT more_bytes;
6912
6913                         if (EQ (coding->src_object, coding->dst_object))
6914                           more_bytes = ((src_end - src) / 2) + 2;
6915                         else
6916                           more_bytes = src_end - src + 2;
6917                         dst = alloc_destination (coding, more_bytes, dst);
6918                         dst_end = coding->destination + coding->dst_bytes;
6919                         coding_set_source (coding);
6920                         src = coding->source + offset;
6921                         src_end = coding->source + coding->src_bytes;
6922                         if (EQ (coding->src_object, coding->dst_object))
6923                           dst_end = (unsigned char *) src;
6924                       }
6925                   }
6926                 EMIT_ONE_BYTE (c);
6927               }
6928         }
6929       else
6930         {
6931           if (!EQ (coding->src_object, coding->dst_object))
6932             {
6933               EMACS_INT require = coding->src_bytes - coding->dst_bytes;
6934
6935               if (require > 0)
6936                 {
6937                   EMACS_INT offset = src - coding->source;
6938
6939                   dst = alloc_destination (coding, require, dst);
6940                   coding_set_source (coding);
6941                   src = coding->source + offset;
6942                   src_end = coding->source + coding->src_bytes;
6943                 }
6944             }
6945           produced_chars = coding->consumed_char;
6946           while (src < src_end)
6947             *dst++ = *src++;
6948         }
6949     }
6950
6951   produced = dst - (coding->destination + coding->produced);
6952   if (BUFFERP (coding->dst_object) && produced_chars > 0)
6953     insert_from_gap (produced_chars, produced);
6954   coding->produced += produced;
6955   coding->produced_char += produced_chars;
6956   return carryover;
6957 }
6958
6959 /* Compose text in CODING->object according to the annotation data at
6960    CHARBUF.  CHARBUF is an array:
6961      [ -LENGTH ANNOTATION_MASK NCHARS NBYTES METHOD [ COMPONENTS... ] ]
6962  */
6963
6964 static INLINE void
6965 produce_composition (coding, charbuf, pos)
6966      struct coding_system *coding;
6967      int *charbuf;
6968      EMACS_INT pos;
6969 {
6970   int len;
6971   EMACS_INT to;
6972   enum composition_method method;
6973   Lisp_Object components;
6974
6975   len = -charbuf[0] - MAX_ANNOTATION_LENGTH;
6976   to = pos + charbuf[2];
6977   method = (enum composition_method) (charbuf[4]);
6978
6979   if (method == COMPOSITION_RELATIVE)
6980     components = Qnil;
6981   else
6982     {
6983       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6984       int i, j;
6985
6986       if (method == COMPOSITION_WITH_RULE)
6987         len = charbuf[2] * 3 - 2;
6988       charbuf += MAX_ANNOTATION_LENGTH;
6989       /* charbuf = [ CHRA ... CHAR] or [ CHAR -2 RULE ... CHAR ] */
6990       for (i = j = 0; i < len && charbuf[i] != -1; i++, j++)
6991         {
6992           if (charbuf[i] >= 0)
6993             args[j] = make_number (charbuf[i]);
6994           else
6995             {
6996               i++;
6997               args[j] = make_number (charbuf[i] % 0x100);
6998             }
6999         }
7000       components = (i == j ? Fstring (j, args) : Fvector (j, args));
7001     }
7002   compose_text (pos, to, components, Qnil, coding->dst_object);
7003 }
7004
7005
7006 /* Put `charset' property on text in CODING->object according to
7007    the annotation data at CHARBUF.  CHARBUF is an array:
7008      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
7009  */
7010
7011 static INLINE void
7012 produce_charset (coding, charbuf, pos)
7013      struct coding_system *coding;
7014      int *charbuf;
7015      EMACS_INT pos;
7016 {
7017   EMACS_INT from = pos - charbuf[2];
7018   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
7019
7020   Fput_text_property (make_number (from), make_number (pos),
7021                       Qcharset, CHARSET_NAME (charset),
7022                       coding->dst_object);
7023 }
7024
7025
/* Number of `int' elements tried first for the conversion work
   buffer.  */
#define CHARBUF_SIZE 0x4000

/* Give CODING (a pointer to a struct coding_system) a work buffer:
   set CODING->charbuf to memory obtained with alloca and
   CODING->charbuf_size to its length in `int' elements.  The first
   attempt is for CHARBUF_SIZE elements; the size is halved after
   each failed attempt as long as it is above 1024.

   Two things are hidden in this macro:
   - The memory comes from alloca, so it belongs to the stack frame of
     the function that uses the macro and is gone once that function
     returns.
   - If no buffer could be obtained, CODING_RESULT_INSUFFICIENT_MEM is
     recorded and the macro executes `return CODING->result' in the
     function that uses it, which must therefore return an integer.
     NOTE(review): alloca usually has no way to report failure, so
     this path is presumably never taken in practice.

   CODING is parenthesized in the expansion, so an expression such as
   `&local' may be passed.  It is evaluated more than once.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int work_size_ = CHARBUF_SIZE;                                      \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (work_size_ > 1024)                                           \
      {                                                                 \
        (coding)->charbuf                                               \
          = (int *) alloca (sizeof (int) * work_size_);                 \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        work_size_ >>= 1;                                               \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = work_size_;                                \
  } while (0)
7047
7048
7049 static void
7050 produce_annotation (coding, pos)
7051      struct coding_system *coding;
7052      EMACS_INT pos;
7053 {
7054   int *charbuf = coding->charbuf;
7055   int *charbuf_end = charbuf + coding->charbuf_used;
7056
7057   if (NILP (coding->dst_object))
7058     return;
7059
7060   while (charbuf < charbuf_end)
7061     {
7062       if (*charbuf >= 0)
7063         pos++, charbuf++;
7064       else
7065         {
7066           int len = -*charbuf;
7067
7068           if (len > 2)
7069             switch (charbuf[1])
7070               {
7071               case CODING_ANNOTATE_COMPOSITION_MASK:
7072                 produce_composition (coding, charbuf, pos);
7073                 break;
7074               case CODING_ANNOTATE_CHARSET_MASK:
7075                 produce_charset (coding, charbuf, pos);
7076                 break;
7077               }
7078           charbuf += len;
7079         }
7080     }
7081 }
7082
/* Decode the data at CODING->src_object into CODING->dst_object.
   CODING->src_object is a buffer, a string, or nil.
   CODING->dst_object is a buffer.

   If CODING->src_object is a buffer, it must be the current buffer.
   In this case, if CODING->src_pos is positive, it is a position of
   the source text in the buffer, otherwise, the source text is in the
   gap area of the buffer, and CODING->src_pos specifies the offset of
   the text from GPT (which must be the same as PT).  If this is the
   same buffer as CODING->dst_object, CODING->src_pos must be
   negative.

   If CODING->src_object is a string, CODING->src_pos is an index to
   that string.

   If CODING->src_object is nil, CODING->source must already point to
   the non-relocatable memory area.  In this case, CODING->src_pos is
   an offset from CODING->source.

   The decoded data is inserted at the current point of the buffer
   CODING->dst_object.

   On entry the counters CODING->consumed, consumed_char, produced,
   produced_char, chars_at_source and errors are reset to 0.  The
   value returned is CODING->result.  */

static int
decode_coding (coding)
     struct coding_system *coding;
{
  Lisp_Object attrs;
  Lisp_Object undo_list;
  Lisp_Object translation_table;
  int carryover;
  int i;

  /* If the source text lies in a buffer and the gap falls inside it,
     move the gap to the head of the source text (presumably so that
     the source bytes can be read as one contiguous area).  */
  if (BUFFERP (coding->src_object)
      && coding->src_pos > 0
      && coding->src_pos < GPT
      && coding->src_pos + coding->src_chars > GPT)
    move_gap_both (coding->src_pos, coding->src_pos_byte);

  /* Make the destination buffer current with the gap at point, and
     switch undo recording off for the duration of the conversion.
     The saved undo list is put back at the end of this function,
     where the whole insertion is recorded by a single call of
     record_insert.  */
  undo_list = Qt;
  if (BUFFERP (coding->dst_object))
    {
      if (current_buffer != XBUFFER (coding->dst_object))
        set_buffer_internal (XBUFFER (coding->dst_object));
      if (GPT != PT)
        move_gap_both (PT, PT_BYTE);
      undo_list = current_buffer->undo_list;
      current_buffer->undo_list = Qt;
    }

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding->chars_at_source = 0;
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->errors = 0;

  ALLOC_CONVERSION_WORK_AREA (coding);

  attrs = CODING_ID_ATTRS (coding->id);
  translation_table = get_translation_table (attrs, 0, NULL);

  /* Main loop: run the decoder to fill CODING->charbuf, then let
     produce_chars write the result out.  produce_chars returns the
     number of elements at the tail of charbuf that are carried over
     to the next round; they are moved to the head of charbuf, and
     the next round starts with charbuf_used set to that count.  The
     loop ends when all source bytes are consumed or when the result
     is neither CODING_RESULT_SUCCESS nor CODING_RESULT_INVALID_SRC.  */
  carryover = 0;
  do
    {
      /* Destination position of the first character produced in this
         round; annotations are applied relative to it.  */
      EMACS_INT pos = coding->dst_pos + coding->produced_char;

      coding_set_source (coding);
      coding->annotated = 0;
      coding->charbuf_used = carryover;
      (*(coding->decoder)) (coding);
      coding_set_destination (coding);
      carryover = produce_chars (coding, translation_table, 0);
      if (coding->annotated)
        produce_annotation (coding, pos);
      for (i = 0; i < carryover; i++)
        coding->charbuf[i]
          = coding->charbuf[coding->charbuf_used - carryover + i];
    }
  while (coding->consumed < coding->src_bytes
         && (coding->result == CODING_RESULT_SUCCESS
             || coding->result == CODING_RESULT_INVALID_SRC));

  /* Flush the elements still carried over from the last round.  The
     final argument is 1 here and 0 in the loop above; presumably it
     marks the last call.  */
  if (carryover > 0)
    {
      coding_set_destination (coding);
      coding->charbuf_used = carryover;
      produce_chars (coding, translation_table, 1);
    }

  /* Deal with source bytes the decoder did not consume.  */
  coding->carryover_bytes = 0;
  if (coding->consumed < coding->src_bytes)
    {
      int nbytes = coding->src_bytes - coding->consumed;
      const unsigned char *src;

      coding_set_source (coding);
      coding_set_destination (coding);
      src = coding->source + coding->consumed;

      if (coding->mode & CODING_MODE_LAST_BLOCK)
        {
          /* Flush out unprocessed data as binary chars.  We are sure
             that the number of data is less than the size of
             coding->charbuf.  */
          coding->charbuf_used = 0;
          coding->chars_at_source = 0;

          while (nbytes-- > 0)
            {
              int c = *src++;

              /* Bytes with the high bit set are converted with
                 BYTE8_TO_CHAR; other bytes are stored as is.  */
              if (c & 0x80)
                c = BYTE8_TO_CHAR (c);
              coding->charbuf[coding->charbuf_used++] = c;
            }
          /* No translation table is applied to these chars.  */
          produce_chars (coding, Qnil, 1);
        }
      else
        {
          /* Record unprocessed bytes in coding->carryover.  We are
             sure that the number of data is less than the size of
             coding->carryover.  */
          unsigned char *p = coding->carryover;

          /* Nevertheless, clamp the count to the size of the array;
             any excess bytes are not recorded.  */
          if (nbytes > sizeof coding->carryover)
            nbytes = sizeof coding->carryover;
          coding->carryover_bytes = nbytes;
          while (nbytes-- > 0)
            *p++ = *src++;
        }
      /* Either way, the whole source now counts as consumed.  */
      coding->consumed = coding->src_bytes;
    }

  /* Run decode_eol unless the eol type is `unix' or eol conversion
     is inhibited.  */
  if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix)
      && !inhibit_eol_conversion)
    decode_eol (coding);
  /* Restore the undo list saved above and record the whole insertion
     in one step.  */
  if (BUFFERP (coding->dst_object))
    {
      current_buffer->undo_list = undo_list;
      record_insert (coding->dst_pos, coding->produced_char);
    }
  return coding->result;
}
7226
7227
7228 /* Extract an annotation datum from a composition starting at POS and
7229    ending before LIMIT of CODING->src_object (buffer or string), store
7230    the data in BUF, set *STOP to a starting position of the next
7231    composition (if any) or to LIMIT, and return the address of the
7232    next element of BUF.
7233
7234    If such an annotation is not found, set *STOP to a starting
7235    position of a composition after POS (if any) or to LIMIT, and
7236    return BUF.  */
7237
7238 static INLINE int *
7239 handle_composition_annotation (pos, limit, coding, buf, stop)
7240      EMACS_INT pos, limit;
7241      struct coding_system *coding;
7242      int *buf;
7243      EMACS_INT *stop;
7244 {
7245   EMACS_INT start, end;
7246   Lisp_Object prop;
7247
7248   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
7249       || end > limit)
7250     *stop = limit;
7251   else if (start > pos)
7252     *stop = start;
7253   else
7254     {
7255       if (start == pos)
7256         {
7257           /* We found a composition.  Store the corresponding
7258              annotation data in BUF.  */
7259           int *head = buf;
7260           enum composition_method method = COMPOSITION_METHOD (prop);
7261           int nchars = COMPOSITION_LENGTH (prop);
7262
7263           ADD_COMPOSITION_DATA (buf, nchars, 0, method);
7264           if (method != COMPOSITION_RELATIVE)
7265             {
7266               Lisp_Object components;
7267               int len, i, i_byte;
7268
7269               components = COMPOSITION_COMPONENTS (prop);
7270               if (VECTORP (components))
7271                 {
7272                   len = XVECTOR (components)->size;
7273                   for (i = 0; i < len; i++)
7274                     *buf++ = XINT (AREF (components, i));
7275                 }
7276               else if (STRINGP (components))
7277                 {
7278                   len = SCHARS (components);
7279                   i = i_byte = 0;
7280                   while (i < len)
7281                     {
7282                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
7283                       buf++;
7284                     }
7285                 }
7286               else if (INTEGERP (components))
7287                 {
7288                   len = 1;
7289                   *buf++ = XINT (components);
7290                 }
7291               else if (CONSP (components))
7292                 {
7293                   for (len = 0; CONSP (components);
7294                        len++, components = XCDR (components))
7295                     *buf++ = XINT (XCAR (components));
7296                 }
7297               else
7298                 abort ();
7299               *head -= len;
7300             }
7301         }
7302
7303       if (find_composition (end, limit, &start, &end, &prop,
7304                             coding->src_object)
7305           && end <= limit)
7306         *stop = start;
7307       else
7308         *stop = limit;
7309     }
7310   return buf;
7311 }
7312
7313
7314 /* Extract an annotation datum from a text property `charset' at POS of
7315    CODING->src_object (buffer of string), store the data in BUF, set
7316    *STOP to the position where the value of `charset' property changes
7317    (limiting by LIMIT), and return the address of the next element of
7318    BUF.
7319
7320    If the property value is nil, set *STOP to the position where the
7321    property value is non-nil (limiting by LIMIT), and return BUF.  */
7322
7323 static INLINE int *
7324 handle_charset_annotation (pos, limit, coding, buf, stop)
7325      EMACS_INT pos, limit;
7326      struct coding_system *coding;
7327      int *buf;
7328      EMACS_INT *stop;
7329 {
7330   Lisp_Object val, next;
7331   int id;
7332
7333   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
7334   if (! NILP (val) && CHARSETP (val))
7335     id = XINT (CHARSET_SYMBOL_ID (val));
7336   else
7337     id = -1;
7338   ADD_CHARSET_DATA (buf, 0, id);
7339   next = Fnext_single_property_change (make_number (pos), Qcharset,
7340                                        coding->src_object,
7341                                        make_number (limit));
7342   *stop = XINT (next);
7343   return buf;
7344 }
7345
7346
/* Read characters from CODING->source, starting just after the
   CODING->consumed bytes already taken, and store them in
   CODING->charbuf until charbuf is (nearly) full or the end position
   of the source text is reached.

   On the way, end-of-line conversion is applied according to the eol
   type of CODING (unless inhibit_eol_conversion is set), annotation
   data are inserted at the positions where the annotation handlers
   ask to stop, and each character is translated by TRANSLATION_TABLE
   if that is non-nil.  MAX_LOOKUP is the number of characters that
   may be looked at for one translation; a lookup buffer of that many
   ints is allocated on the stack when TRANSLATION_TABLE is non-nil.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used describe what was read and stored, and
   CODING->chars_at_source is 0.  */

static void
consume_chars (coding, translation_table, max_lookup)
     struct coding_system *coding;
     Lisp_Object translation_table;
     int max_lookup;
{
  int *buf = coding->charbuf;
  int *buf_end = coding->charbuf + coding->charbuf_size;
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  EMACS_INT pos = coding->src_pos + coding->consumed_char;
  EMACS_INT end_pos = coding->src_pos + coding->src_chars;
  int multibytep = coding->src_multibyte;
  Lisp_Object eol_type;
  int c;
  /* STOP is the nearest position at which an annotation handler must
     be consulted: the smaller of STOP_COMPOSITION and STOP_CHARSET.  */
  EMACS_INT stop, stop_composition, stop_charset;
  int *lookup_buf = NULL;

  if (! NILP (translation_table))
    lookup_buf = alloca (sizeof (int) * max_lookup);

  /* An eol type that is a vector is treated the same as `unix'.  */
  eol_type = inhibit_eol_conversion ? Qunix : CODING_ID_EOL_TYPE (coding->id);
  if (VECTORP (eol_type))
    eol_type = Qunix;

  /* Note: composition handling is not yet implemented.  Because the
     flag is cleared here, the composition test just below is always
     false and stop_composition starts out as end_pos.  */
  coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;

  if (NILP (coding->src_object))
    stop = stop_composition = stop_charset = end_pos;
  else
    {
      if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
        stop = stop_composition = pos;
      else
        stop = stop_composition = end_pos;
      if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
        stop = stop_charset = pos;
      else
        stop_charset = end_pos;
    }

  /* Compensate for CRLF and conversion.  Headroom is left at the end
     of charbuf: one element for the '\r' that dos eol conversion may
     add, and MAX_ANNOTATION_LENGTH elements, presumably for an
     annotation stored by one of the handlers called below.  */
  buf_end -= 1 + MAX_ANNOTATION_LENGTH;
  while (buf < buf_end)
    {
      Lisp_Object trans;

      if (pos == stop)
        {
          if (pos == end_pos)
            break;
          if (pos == stop_composition)
            buf = handle_composition_annotation (pos, end_pos, coding,
                                                 buf, &stop_composition);
          if (pos == stop_charset)
            buf = handle_charset_annotation (pos, end_pos, coding,
                                             buf, &stop_charset);
          stop = (stop_composition < stop_charset
                  ? stop_composition : stop_charset);
        }

      /* Fetch the next character C, advancing SRC and POS.  */
      if (! multibytep)
        {
          EMACS_INT bytes;

          /* For the raw-text encoder each byte is taken as is.
             Otherwise, a valid multibyte sequence found in the
             source is read as one character, with POS advanced by
             its byte length; any other byte is converted with
             BYTE8_TO_CHAR.  */
          if (coding->encoder == encode_coding_raw_text)
            c = *src++, pos++;
          else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
            c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos += bytes;
          else
            c = BYTE8_TO_CHAR (*src), src++, pos++;
        }
      else
        c = STRING_CHAR_ADVANCE_NO_UNIFY (src), pos++;
      /* In selective-display mode a '\r' is treated as a newline.  */
      if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
        c = '\n';
      /* End-of-line conversion: for the dos type a '\r' is stored in
         front of the '\n'; for any other non-unix type the '\n' is
         replaced by '\r'.  */
      if (! EQ (eol_type, Qunix))
        {
          if (c == '\n')
            {
              if (EQ (eol_type, Qdos))
                *buf++ = '\r';
              else
                c = '\r';
            }
        }

      trans = Qnil;
      LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
      if (NILP (trans))
        *buf++ = c;
      else
        {
          /* FROM_NCHARS source characters are replaced by TO_NCHARS
             characters.  */
          int from_nchars = 1, to_nchars = 1;
          int *lookup_buf_end;
          const unsigned char *p = src;
          int i;

          /* Collect C and up to MAX_LOOKUP - 1 following characters
             in LOOKUP_BUF without advancing SRC.  */
          lookup_buf[0] = c;
          for (i = 1; i < max_lookup && p < src_end; i++)
            lookup_buf[i] = STRING_CHAR_ADVANCE (p);
          lookup_buf_end = lookup_buf + i;
          trans = get_translation (trans, lookup_buf, lookup_buf_end);
          if (INTEGERP (trans))
            c = XINT (trans);
          else if (CONSP (trans))
            {
              /* The car is a vector whose size is the number of
                 source characters matched; the cdr is either a
                 single character or a vector of characters.  */
              from_nchars = ASIZE (XCAR (trans));
              trans = XCDR (trans);
              if (INTEGERP (trans))
                c = XINT (trans);
              else
                {
                  to_nchars = ASIZE (trans);
                  /* Stop if the replacement does not fit in charbuf.  */
                  if (buf + to_nchars > buf_end)
                    break;
                  c = XINT (AREF (trans, 0));
                }
            }
          else
            break;
          *buf++ = c;
          for (i = 1; i < to_nchars; i++)
            *buf++ = XINT (AREF (trans, i));
          /* Skip the remaining matched source characters.  */
          for (i = 1; i < from_nchars; i++, pos++)
            src += MULTIBYTE_LENGTH_NO_CHECK (src);
        }
    }

  coding->consumed = src - coding->source;
  coding->consumed_char = pos - coding->src_pos;
  coding->charbuf_used = buf - coding->charbuf;
  coding->chars_at_source = 0;
}
7482
7483
7484 /* Encode the text at CODING->src_object into CODING->dst_object.
7485    CODING->src_object is a buffer or a string.
7486    CODING->dst_object is a buffer or nil.
7487
7488    If CODING->src_object is a buffer, it must be the current buffer.
7489    In this case, if CODING->src_pos is positive, it is a position of
7490    the source text in the buffer, otherwise. the source text is in the
7491    gap area of the buffer, and coding->src_pos specifies the offset of
7492    the text from GPT (which must be the same as PT).  If this is the
7493    same buffer as CODING->dst_object, CODING->src_pos must be
7494    negative and CODING should not have `pre-write-conversion'.
7495
7496    If CODING->src_object is a string, CODING should not have
7497    `pre-write-conversion'.
7498
7499    If CODING->dst_object is a buffer, the encoded data is inserted at
7500    the current point of that buffer.
7501
7502    If CODING->dst_object is nil, the encoded data is placed at the
7503    memory area specified by CODING->destination.  */
7504
7505 static int
7506 encode_coding (coding)
7507      struct coding_system *coding;
7508 {
7509   Lisp_Object attrs;
7510   Lisp_Object translation_table;
7511   int max_lookup;
7512
7513   attrs = CODING_ID_ATTRS (coding->id);
7514   if (coding->encoder == encode_coding_raw_text)
7515     translation_table = Qnil, max_lookup = 0;
7516   else
7517     translation_table = get_translation_table (attrs, 1, &max_lookup);
7518
7519   if (BUFFERP (coding->dst_object))
7520     {
7521       set_buffer_internal (XBUFFER (coding->dst_object));
7522       coding->dst_multibyte
7523         = ! NILP (current_buffer->enable_multibyte_characters);
7524     }
7525
7526   coding->consumed = coding->consumed_char = 0;
7527   coding->produced = coding->produced_char = 0;
7528   record_conversion_result (coding, CODING_RESULT_SUCCESS);
7529   coding->errors = 0;
7530
7531   ALLOC_CONVERSION_WORK_AREA (coding);
7532
7533   do {
7534     coding_set_source (coding);
7535     consume_chars (coding, translation_table, max_lookup);
7536     coding_set_destination (coding);
7537     (*(coding->encoder)) (coding);
7538   } while (coding->consumed_char < coding->src_chars);
7539
7540   if (BUFFERP (coding->dst_object) && coding->produced_char > 0)
7541     insert_from_gap (coding->produced_char, coding->produced);
7542
7543   return (coding->result);
7544 }
7545
7546
/* Name (or base name) of work buffer for code conversion.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer increments this on every call, and
   code_conversion_restore resets it to 0 when the reused buffer is
   released, so the value may be greater than 1.  */
static int reused_workbuf_in_use;
7559
7560
7561 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
7562    multibyteness of returning buffer.  */
7563
7564 static Lisp_Object
7565 make_conversion_work_buffer (multibyte)
7566      int multibyte;
7567 {
7568   Lisp_Object name, workbuf;
7569   struct buffer *current;
7570
7571   if (reused_workbuf_in_use++)
7572     {
7573       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
7574       workbuf = Fget_buffer_create (name);
7575     }
7576   else
7577     {
7578       if (NILP (Fbuffer_live_p (Vcode_conversion_reused_workbuf)))
7579         Vcode_conversion_reused_workbuf
7580           = Fget_buffer_create (Vcode_conversion_workbuf_name);
7581       workbuf = Vcode_conversion_reused_workbuf;
7582     }
7583   current = current_buffer;
7584   set_buffer_internal (XBUFFER (workbuf));
7585   /* We can't allow modification hooks to run in the work buffer.  For
7586      instance, directory_files_internal assumes that file decoding
7587      doesn't compile new regexps.  */
7588   Fset (Fmake_local_variable (Qinhibit_modification_hooks), Qt);
7589   Ferase_buffer ();
7590   current_buffer->undo_list = Qt;
7591   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
7592   set_buffer_internal (current);
7593   return workbuf;
7594 }
7595
7596
7597 static Lisp_Object
7598 code_conversion_restore (arg)
7599      Lisp_Object arg;
7600 {
7601   Lisp_Object current, workbuf;
7602   struct gcpro gcpro1;
7603
7604   GCPRO1 (arg);
7605   current = XCAR (arg);
7606   workbuf = XCDR (arg);
7607   if (! NILP (workbuf))
7608     {
7609       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
7610         reused_workbuf_in_use = 0;
7611       else if (! NILP (Fbuffer_live_p (workbuf)))
7612         Fkill_buffer (workbuf);
7613     }
7614   set_buffer_internal (XBUFFER (current));
7615   UNGCPRO;
7616   return Qnil;
7617 }
7618
7619 Lisp_Object
7620 code_conversion_save (with_work_buf, multibyte)
7621      int with_work_buf, multibyte;
7622 {
7623   Lisp_Object workbuf = Qnil;
7624
7625   if (with_work_buf)
7626     workbuf = make_conversion_work_buffer (multibyte);
7627   record_unwind_protect (code_conversion_restore,
7628                          Fcons (Fcurrent_buffer (), workbuf));
7629   return workbuf;
7630 }
7631
7632 int
7633 decode_coding_gap (coding, chars, bytes)
7634      struct coding_system *coding;
7635      EMACS_INT chars, bytes;
7636 {
7637   int count = specpdl_ptr - specpdl;
7638   Lisp_Object attrs;
7639
7640   code_conversion_save (0, 0);
7641
7642   coding->src_object = Fcurrent_buffer ();
7643   coding->src_chars = chars;
7644   coding->src_bytes = bytes;
7645   coding->src_pos = -chars;
7646   coding->src_pos_byte = -bytes;
7647   coding->src_multibyte = chars < bytes;
7648   coding->dst_object = coding->src_object;
7649   coding->dst_pos = PT;
7650   coding->dst_pos_byte = PT_BYTE;
7651   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7652
7653   if (CODING_REQUIRE_DETECTION (coding))
7654     detect_coding (coding);
7655
7656   coding->mode |= CODING_MODE_LAST_BLOCK;
7657   current_buffer->text->inhibit_shrinking = 1;
7658   decode_coding (coding);
7659   current_buffer->text->inhibit_shrinking = 0;
7660
7661   attrs = CODING_ID_ATTRS (coding->id);
7662   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7663     {
7664       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7665       Lisp_Object val;
7666
7667       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7668       val = call1 (CODING_ATTR_POST_READ (attrs),
7669                    make_number (coding->produced_char));
7670       CHECK_NATNUM (val);
7671       coding->produced_char += Z - prev_Z;
7672       coding->produced += Z_BYTE - prev_Z_BYTE;
7673     }
7674
7675   unbind_to (count, Qnil);
7676   return coding->result;
7677 }
7678
7679 int
7680 encode_coding_gap (coding, chars, bytes)
7681      struct coding_system *coding;
7682      EMACS_INT chars, bytes;
7683 {
7684   int count = specpdl_ptr - specpdl;
7685
7686   code_conversion_save (0, 0);
7687
7688   coding->src_object = Fcurrent_buffer ();
7689   coding->src_chars = chars;
7690   coding->src_bytes = bytes;
7691   coding->src_pos = -chars;
7692   coding->src_pos_byte = -bytes;
7693   coding->src_multibyte = chars < bytes;
7694   coding->dst_object = coding->src_object;
7695   coding->dst_pos = PT;
7696   coding->dst_pos_byte = PT_BYTE;
7697
7698   encode_coding (coding);
7699
7700   unbind_to (count, Qnil);
7701   return coding->result;
7702 }
7703
7704
7705 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
7706    SRC_OBJECT into DST_OBJECT by coding context CODING.
7707
7708    SRC_OBJECT is a buffer, a string, or Qnil.
7709
7710    If it is a buffer, the text is at point of the buffer.  FROM and TO
7711    are positions in the buffer.
7712
7713    If it is a string, the text is at the beginning of the string.
7714    FROM and TO are indices to the string.
7715
7716    If it is nil, the text is at coding->source.  FROM and TO are
7717    indices to coding->source.
7718
7719    DST_OBJECT is a buffer, Qt, or Qnil.
7720
7721    If it is a buffer, the decoded text is inserted at point of the
7722    buffer.  If the buffer is the same as SRC_OBJECT, the source text
7723    is deleted.
7724
7725    If it is Qt, a string is made from the decoded text, and
7726    set in CODING->dst_object.
7727
7728    If it is Qnil, the decoded text is stored at CODING->destination.
7729    The caller must allocate CODING->dst_bytes bytes at
7730    CODING->destination by xmalloc.  If the decoded text is longer than
7731    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
7732  */
7733
7734 void
7735 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7736                       dst_object)
7737      struct coding_system *coding;
7738      Lisp_Object src_object;
7739      EMACS_INT from, from_byte, to, to_byte;
7740      Lisp_Object dst_object;
7741 {
7742   int count = specpdl_ptr - specpdl;
7743   unsigned char *destination;
7744   EMACS_INT dst_bytes;
7745   EMACS_INT chars = to - from;
7746   EMACS_INT bytes = to_byte - from_byte;
7747   Lisp_Object attrs;
7748   int saved_pt = -1, saved_pt_byte;
7749   int need_marker_adjustment = 0;
7750   Lisp_Object old_deactivate_mark;
7751
7752   old_deactivate_mark = Vdeactivate_mark;
7753
7754   if (NILP (dst_object))
7755     {
7756       destination = coding->destination;
7757       dst_bytes = coding->dst_bytes;
7758     }
7759
7760   coding->src_object = src_object;
7761   coding->src_chars = chars;
7762   coding->src_bytes = bytes;
7763   coding->src_multibyte = chars < bytes;
7764
7765   if (STRINGP (src_object))
7766     {
7767       coding->src_pos = from;
7768       coding->src_pos_byte = from_byte;
7769     }
7770   else if (BUFFERP (src_object))
7771     {
7772       set_buffer_internal (XBUFFER (src_object));
7773       if (from != GPT)
7774         move_gap_both (from, from_byte);
7775       if (EQ (src_object, dst_object))
7776         {
7777           struct Lisp_Marker *tail;
7778
7779           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7780             {
7781               tail->need_adjustment
7782                 = tail->charpos == (tail->insertion_type ? from : to);
7783               need_marker_adjustment |= tail->need_adjustment;
7784             }
7785           saved_pt = PT, saved_pt_byte = PT_BYTE;
7786           TEMP_SET_PT_BOTH (from, from_byte);
7787           current_buffer->text->inhibit_shrinking = 1;
7788           del_range_both (from, from_byte, to, to_byte, 1);
7789           coding->src_pos = -chars;
7790           coding->src_pos_byte = -bytes;
7791         }
7792       else
7793         {
7794           coding->src_pos = from;
7795           coding->src_pos_byte = from_byte;
7796         }
7797     }
7798
7799   if (CODING_REQUIRE_DETECTION (coding))
7800     detect_coding (coding);
7801   attrs = CODING_ID_ATTRS (coding->id);
7802
7803   if (EQ (dst_object, Qt)
7804       || (! NILP (CODING_ATTR_POST_READ (attrs))
7805           && NILP (dst_object)))
7806     {
7807       coding->dst_multibyte = !CODING_FOR_UNIBYTE (coding);
7808       coding->dst_object = code_conversion_save (1, coding->dst_multibyte);
7809       coding->dst_pos = BEG;
7810       coding->dst_pos_byte = BEG_BYTE;
7811     }
7812   else if (BUFFERP (dst_object))
7813     {
7814       code_conversion_save (0, 0);
7815       coding->dst_object = dst_object;
7816       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7817       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7818       coding->dst_multibyte
7819         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7820     }
7821   else
7822     {
7823       code_conversion_save (0, 0);
7824       coding->dst_object = Qnil;
7825       /* Most callers presume this will return a multibyte result, and they
7826          won't use `binary' or `raw-text' anyway, so let's not worry about
7827          CODING_FOR_UNIBYTE.  */
7828       coding->dst_multibyte = 1;
7829     }
7830
7831   decode_coding (coding);
7832
7833   if (BUFFERP (coding->dst_object))
7834     set_buffer_internal (XBUFFER (coding->dst_object));
7835
7836   if (! NILP (CODING_ATTR_POST_READ (attrs)))
7837     {
7838       struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7839       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
7840       Lisp_Object val;
7841
7842       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
7843       GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7844               old_deactivate_mark);
7845       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
7846                         make_number (coding->produced_char));
7847       UNGCPRO;
7848       CHECK_NATNUM (val);
7849       coding->produced_char += Z - prev_Z;
7850       coding->produced += Z_BYTE - prev_Z_BYTE;
7851     }
7852
7853   if (EQ (dst_object, Qt))
7854     {
7855       coding->dst_object = Fbuffer_string ();
7856     }
7857   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
7858     {
7859       set_buffer_internal (XBUFFER (coding->dst_object));
7860       if (dst_bytes < coding->produced)
7861         {
7862           destination = xrealloc (destination, coding->produced);
7863           if (! destination)
7864             {
7865               record_conversion_result (coding,
7866                                         CODING_RESULT_INSUFFICIENT_DST);
7867               unbind_to (count, Qnil);
7868               return;
7869             }
7870           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
7871             move_gap_both (BEGV, BEGV_BYTE);
7872           bcopy (BEGV_ADDR, destination, coding->produced);
7873           coding->destination = destination;
7874         }
7875     }
7876
7877   if (saved_pt >= 0)
7878     {
7879       /* This is the case of:
7880          (BUFFERP (src_object) && EQ (src_object, dst_object))
7881          As we have moved PT while replacing the original buffer
7882          contents, we must recover it now.  */
7883       set_buffer_internal (XBUFFER (src_object));
7884       current_buffer->text->inhibit_shrinking = 0;
7885       if (saved_pt < from)
7886         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7887       else if (saved_pt < from + chars)
7888         TEMP_SET_PT_BOTH (from, from_byte);
7889       else if (! NILP (current_buffer->enable_multibyte_characters))
7890         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7891                           saved_pt_byte + (coding->produced - bytes));
7892       else
7893         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7894                           saved_pt_byte + (coding->produced - bytes));
7895
7896       if (need_marker_adjustment)
7897         {
7898           struct Lisp_Marker *tail;
7899
7900           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7901             if (tail->need_adjustment)
7902               {
7903                 tail->need_adjustment = 0;
7904                 if (tail->insertion_type)
7905                   {
7906                     tail->bytepos = from_byte;
7907                     tail->charpos = from;
7908                   }
7909                 else
7910                   {
7911                     tail->bytepos = from_byte + coding->produced;
7912                     tail->charpos
7913                       = (NILP (current_buffer->enable_multibyte_characters)
7914                          ? tail->bytepos : from + coding->produced_char);
7915                   }
7916               }
7917         }
7918     }
7919
7920   Vdeactivate_mark = old_deactivate_mark;
7921   unbind_to (count, coding->dst_object);
7922 }
7923
7924
7925 void
7926 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
7927                       dst_object)
7928      struct coding_system *coding;
7929      Lisp_Object src_object;
7930      EMACS_INT from, from_byte, to, to_byte;
7931      Lisp_Object dst_object;
7932 {
7933   int count = specpdl_ptr - specpdl;
7934   EMACS_INT chars = to - from;
7935   EMACS_INT bytes = to_byte - from_byte;
7936   Lisp_Object attrs;
7937   int saved_pt = -1, saved_pt_byte;
7938   int need_marker_adjustment = 0;
7939   int kill_src_buffer = 0;
7940   Lisp_Object old_deactivate_mark;
7941
7942   old_deactivate_mark = Vdeactivate_mark;
7943
7944   coding->src_object = src_object;
7945   coding->src_chars = chars;
7946   coding->src_bytes = bytes;
7947   coding->src_multibyte = chars < bytes;
7948
7949   attrs = CODING_ID_ATTRS (coding->id);
7950
7951   if (EQ (src_object, dst_object))
7952     {
7953       struct Lisp_Marker *tail;
7954
7955       for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
7956         {
7957           tail->need_adjustment
7958             = tail->charpos == (tail->insertion_type ? from : to);
7959           need_marker_adjustment |= tail->need_adjustment;
7960         }
7961     }
7962
7963   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
7964     {
7965       coding->src_object = code_conversion_save (1, coding->src_multibyte);
7966       set_buffer_internal (XBUFFER (coding->src_object));
7967       if (STRINGP (src_object))
7968         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
7969       else if (BUFFERP (src_object))
7970         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
7971       else
7972         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
7973
7974       if (EQ (src_object, dst_object))
7975         {
7976           set_buffer_internal (XBUFFER (src_object));
7977           saved_pt = PT, saved_pt_byte = PT_BYTE;
7978           del_range_both (from, from_byte, to, to_byte, 1);
7979           set_buffer_internal (XBUFFER (coding->src_object));
7980         }
7981
7982       {
7983         Lisp_Object args[3];
7984         struct gcpro gcpro1, gcpro2, gcpro3, gcpro4, gcpro5;
7985
7986         GCPRO5 (coding->src_object, coding->dst_object, src_object, dst_object,
7987                 old_deactivate_mark);
7988         args[0] = CODING_ATTR_PRE_WRITE (attrs);
7989         args[1] = make_number (BEG);
7990         args[2] = make_number (Z);
7991         safe_call (3, args);
7992         UNGCPRO;
7993       }
7994       if (XBUFFER (coding->src_object) != current_buffer)
7995         kill_src_buffer = 1;
7996       coding->src_object = Fcurrent_buffer ();
7997       if (BEG != GPT)
7998         move_gap_both (BEG, BEG_BYTE);
7999       coding->src_chars = Z - BEG;
8000       coding->src_bytes = Z_BYTE - BEG_BYTE;
8001       coding->src_pos = BEG;
8002       coding->src_pos_byte = BEG_BYTE;
8003       coding->src_multibyte = Z < Z_BYTE;
8004     }
8005   else if (STRINGP (src_object))
8006     {
8007       code_conversion_save (0, 0);
8008       coding->src_pos = from;
8009       coding->src_pos_byte = from_byte;
8010     }
8011   else if (BUFFERP (src_object))
8012     {
8013       code_conversion_save (0, 0);
8014       set_buffer_internal (XBUFFER (src_object));
8015       if (EQ (src_object, dst_object))
8016         {
8017           saved_pt = PT, saved_pt_byte = PT_BYTE;
8018           coding->src_object = del_range_1 (from, to, 1, 1);
8019           coding->src_pos = 0;
8020           coding->src_pos_byte = 0;
8021         }
8022       else
8023         {
8024           if (from < GPT && to >= GPT)
8025             move_gap_both (from, from_byte);
8026           coding->src_pos = from;
8027           coding->src_pos_byte = from_byte;
8028         }
8029     }
8030   else
8031     code_conversion_save (0, 0);
8032
8033   if (BUFFERP (dst_object))
8034     {
8035       coding->dst_object = dst_object;
8036       if (EQ (src_object, dst_object))
8037         {
8038           coding->dst_pos = from;
8039           coding->dst_pos_byte = from_byte;
8040         }
8041       else
8042         {
8043           struct buffer *current = current_buffer;
8044
8045           set_buffer_temp (XBUFFER (dst_object));
8046           coding->dst_pos = PT;
8047           coding->dst_pos_byte = PT_BYTE;
8048           move_gap_both (coding->dst_pos, coding->dst_pos_byte);
8049           set_buffer_temp (current);
8050         }
8051       coding->dst_multibyte
8052         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
8053     }
8054   else if (EQ (dst_object, Qt))
8055     {
8056       coding->dst_object = Qnil;
8057       coding->dst_bytes = coding->src_chars;
8058       if (coding->dst_bytes == 0)
8059         coding->dst_bytes = 1;
8060       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
8061       coding->dst_multibyte = 0;
8062     }
8063   else
8064     {
8065       coding->dst_object = Qnil;
8066       coding->dst_multibyte = 0;
8067     }
8068
8069   encode_coding (coding);
8070
8071   if (EQ (dst_object, Qt))
8072     {
8073       if (BUFFERP (coding->dst_object))
8074         coding->dst_object = Fbuffer_string ();
8075       else
8076         {
8077           coding->dst_object
8078             = make_unibyte_string ((char *) coding->destination,
8079                                    coding->produced);
8080           xfree (coding->destination);
8081         }
8082     }
8083
8084   if (saved_pt >= 0)
8085     {
8086       /* This is the case of:
8087          (BUFFERP (src_object) && EQ (src_object, dst_object))
8088          As we have moved PT while replacing the original buffer
8089          contents, we must recover it now.  */
8090       set_buffer_internal (XBUFFER (src_object));
8091       if (saved_pt < from)
8092         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
8093       else if (saved_pt < from + chars)
8094         TEMP_SET_PT_BOTH (from, from_byte);
8095       else if (! NILP (current_buffer->enable_multibyte_characters))
8096         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
8097                           saved_pt_byte + (coding->produced - bytes));
8098       else
8099         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
8100                           saved_pt_byte + (coding->produced - bytes));
8101
8102       if (need_marker_adjustment)
8103         {
8104           struct Lisp_Marker *tail;
8105
8106           for (tail = BUF_MARKERS (current_buffer); tail; tail = tail->next)
8107             if (tail->need_adjustment)
8108               {
8109                 tail->need_adjustment = 0;
8110                 if (tail->insertion_type)
8111                   {
8112                     tail->bytepos = from_byte;
8113                     tail->charpos = from;
8114                   }
8115                 else
8116                   {
8117                     tail->bytepos = from_byte + coding->produced;
8118                     tail->charpos
8119                       = (NILP (current_buffer->enable_multibyte_characters)
8120                          ? tail->bytepos : from + coding->produced_char);
8121                   }
8122               }
8123         }
8124     }
8125
8126   if (kill_src_buffer)
8127     Fkill_buffer (coding->src_object);
8128
8129   Vdeactivate_mark = old_deactivate_mark;
8130   unbind_to (count, Qnil);
8131 }
8132
8133
8134 Lisp_Object
8135 preferred_coding_system ()
8136 {
8137   int id = coding_categories[coding_priorities[0]].id;
8138
8139   return CODING_ID_NAME (id);
8140 }
8141
8142 \f
8143 #ifdef emacs
8144 /*** 11. Emacs Lisp library functions ***/
8145
8146 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
8147        doc: /* Return t if OBJECT is nil or a coding-system.
8148 See the documentation of `define-coding-system' for information
8149 about coding-system objects.  */)
8150      (object)
8151      Lisp_Object object;
8152 {
8153   if (NILP (object)
8154       || CODING_SYSTEM_ID (object) >= 0)
8155     return Qt;
8156   if (! SYMBOLP (object)
8157       || NILP (Fget (object, Qcoding_system_define_form)))
8158     return Qnil;
8159   return Qt;
8160 }
8161
8162 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
8163        Sread_non_nil_coding_system, 1, 1, 0,
8164        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
8165      (prompt)
8166      Lisp_Object prompt;
8167 {
8168   Lisp_Object val;
8169   do
8170     {
8171       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8172                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
8173     }
8174   while (SCHARS (val) == 0);
8175   return (Fintern (val, Qnil));
8176 }
8177
8178 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
8179        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
8180 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.
8181 Ignores case when completing coding systems (all Emacs coding systems
8182 are lower-case).  */)
8183      (prompt, default_coding_system)
8184      Lisp_Object prompt, default_coding_system;
8185 {
8186   Lisp_Object val;
8187   int count = SPECPDL_INDEX ();
8188
8189   if (SYMBOLP (default_coding_system))
8190     default_coding_system = SYMBOL_NAME (default_coding_system);
8191   specbind (Qcompletion_ignore_case, Qt);
8192   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
8193                           Qt, Qnil, Qcoding_system_history,
8194                           default_coding_system, Qnil);
8195   unbind_to (count, Qnil);
8196   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
8197 }
8198
8199 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
8200        1, 1, 0,
8201        doc: /* Check validity of CODING-SYSTEM.
8202 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
8203 It is valid if it is nil or a symbol defined as a coding system by the
8204 function `define-coding-system'.  */)
8205   (coding_system)
8206      Lisp_Object coding_system;
8207 {
8208   Lisp_Object define_form;
8209
8210   define_form = Fget (coding_system, Qcoding_system_define_form);
8211   if (! NILP (define_form))
8212     {
8213       Fput (coding_system, Qcoding_system_define_form, Qnil);
8214       safe_eval (define_form);
8215     }
8216   if (!NILP (Fcoding_system_p (coding_system)))
8217     return coding_system;
8218   xsignal1 (Qcoding_system_error, coding_system);
8219 }
8220
8221 \f
8222 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
8223    HIGHEST is nonzero, return the coding system of the highest
8224    priority among the detected coding systems.  Otherwise return a
8225    list of detected coding systems sorted by their priorities.  If
8226    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
8227    multibyte form but contain only ASCII and eight-bit chars.
8228    Otherwise, the bytes are raw bytes.
8229
8230    CODING-SYSTEM controls the detection as below:
8231
8232    If it is nil, detect both text-format and eol-format.  If the
8233    text-format part of CODING-SYSTEM is already specified
8234    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
8235    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
8236    detect only text-format.

        Internally the candidates are first collected in VAL as a list
        of coding system IDs; the final pass replaces each ID by a
        coding system name.  When HIGHEST is nonzero and VAL ends up
        empty, the return value is nil.  */
8237
8238 Lisp_Object
8239 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
8240                       coding_system)
8241      const unsigned char *src;
8242      EMACS_INT src_chars, src_bytes;
8243      int highest;
8244      int multibytep;
8245      Lisp_Object coding_system;
8246 {
8247   const unsigned char *src_end = src + src_bytes;
8248   Lisp_Object attrs, eol_type;
8249   Lisp_Object val = Qnil;
8250   struct coding_system coding;
8251   int id;
8252   struct coding_detection_info detect_info;
8253   enum coding_category base_category;
8254   int null_byte_found = 0, eight_bit_found = 0;
8255
       /* A nil CODING-SYSTEM means `undecided'.  */
8256   if (NILP (coding_system))
8257     coding_system = Qundecided;
8258   setup_coding_system (coding_system, &coding);
8259   attrs = CODING_ID_ATTRS (coding.id);
8260   eol_type = CODING_ID_EOL_TYPE (coding.id);
8261   coding_system = CODING_ATTR_BASE_NAME (attrs);
8262
       /* Describe the whole input as a single, final block.  */
8263   coding.source = src;
8264   coding.src_chars = src_chars;
8265   coding.src_bytes = src_bytes;
8266   coding.src_multibyte = multibytep;
8267   coding.consumed = 0;
8268   coding.mode |= CODING_MODE_LAST_BLOCK;
8269   coding.head_ascii = 0;
8270
8271   detect_info.checked = detect_info.found = detect_info.rejected = 0;
8272
8273   /* At first, detect text-format if necessary.  */
8274   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
8275   if (base_category == coding_category_undecided)
8276     {
8277       enum coding_category category;
8278       struct coding_system *this;
8279       int c, i;
8280
8281       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
8282       for (; src < src_end; src++)
8283         {
8284           c = *src;
8285           if (c & 0x80)
8286             {
                   /* Stop scanning once both an eight-bit byte and a
                      null byte have been seen.  */
8287               eight_bit_found = 1;
8288               if (null_byte_found)
8289                 break;
8290             }
8291           else if (c < 0x20)
8292             {
8293               if ((c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
8294                   && ! inhibit_iso_escape_detection
8295                   && ! detect_info.checked)
8296                 {
8297                   if (detect_coding_iso_2022 (&coding, &detect_info))
8298                     {
8299                       /* We have scanned the whole data.  */
8300                       if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
8301                         {
8302                           /* We didn't find an 8-bit code.  We may
8303                              have found a null-byte, but it's very
8304                              rare that a binary file conform to
8305                              ISO-2022.  */
8306                           src = src_end;
8307                           coding.head_ascii = src - coding.source;
8308                         }
8309                       detect_info.rejected |= ~CATEGORY_MASK_ISO_ESCAPE;
8310                       break;
8311                     }
8312                 }
8313               else if (! c && !inhibit_null_byte_detection)
8314                 {
8315                   null_byte_found = 1;
8316                   if (eight_bit_found)
8317                     break;
8318                 }
                   /* A control byte still counts toward the leading
                      7-bit run as long as no eight-bit byte was seen.  */
8319               if (! eight_bit_found)
8320                 coding.head_ascii++;
8321             }
8322           else if (! eight_bit_found)
8323             coding.head_ascii++;
8324         }
8325
8326       if (null_byte_found || eight_bit_found
8327           || coding.head_ascii < coding.src_bytes
8328           || detect_info.found)
8329         {
8330           if (coding.head_ascii == coding.src_bytes)
8331             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
8332             for (i = 0; i < coding_category_raw_text; i++)
8333               {
8334                 category = coding_priorities[i];
8335                 this = coding_categories + category;
8336                 if (detect_info.found & (1 << category))
8337                   break;
8338               }
8339           else
8340             {
8341               if (null_byte_found)
8342                 {
                       /* Mark every category other than the UTF-16
                          ones as already checked and rejected, so the
                          loop below does not call their detectors.  */
8343                   detect_info.checked |= ~CATEGORY_MASK_UTF_16;
8344                   detect_info.rejected |= ~CATEGORY_MASK_UTF_16;
8345                 }
                   /* Try the categories in priority order.  */
8346               for (i = 0; i < coding_category_raw_text; i++)
8347                 {
8348                   category = coding_priorities[i];
8349                   this = coding_categories + category;
8350
8351                   if (this->id < 0)
8352                     {
8353                       /* No coding system of this category is defined.  */
8354                       detect_info.rejected |= (1 << category);
8355                     }
8356                   else if (category >= coding_category_raw_text)
8357                     continue;
8358                   else if (detect_info.checked & (1 << category))
8359                     {
                           /* Already examined: with HIGHEST, the first
                              category that was found ends the search.  */
8360                       if (highest
8361                           && (detect_info.found & (1 << category)))
8362                         break;
8363                     }
8364                   else if ((*(this->detector)) (&coding, &detect_info)
8365                            && highest
8366                            && (detect_info.found & (1 << category)))
8367                     {
                           /* For the auto-detecting UTF-16 category,
                              narrow CATEGORY down to the LE or BE
                              variant according to what was found.  */
8368                       if (category == coding_category_utf_16_auto)
8369                         {
8370                           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8371                             category = coding_category_utf_16_le;
8372                           else
8373                             category = coding_category_utf_16_be;
8374                         }
8375                       break;
8376                     }
8377                 }
8378             }
8379         }
8380
           /* Everything was rejected, or a null byte was seen: the
              only candidate is `no-conversion'.  */
8381       if ((detect_info.rejected & CATEGORY_MASK_ANY) == CATEGORY_MASK_ANY
8382           || null_byte_found)
8383         {
8384           detect_info.found = CATEGORY_MASK_RAW_TEXT;
8385           id = CODING_SYSTEM_ID (Qno_conversion);
8386           val = Fcons (make_number (id), Qnil);
8387         }
           /* Nothing was rejected and nothing was found: use the coding
              system registered for the undecided category.  */
8388       else if (! detect_info.rejected && ! detect_info.found)
8389         {
8390           detect_info.found = CATEGORY_MASK_ANY;
8391           id = coding_categories[coding_category_undecided].id;
8392           val = Fcons (make_number (id), Qnil);
8393         }
8394       else if (highest)
8395         {
8396           if (detect_info.found)
8397             {
                   /* CATEGORY and THIS still hold the values left by
                      the detection loops above.  */
8398               detect_info.found = 1 << category;
8399               val = Fcons (make_number (this->id), Qnil);
8400             }
8401           else
                 /* Nothing was positively found: take the first
                    category in priority order that was not rejected.  */
8402             for (i = 0; i < coding_category_raw_text; i++)
8403               if (! (detect_info.rejected & (1 << coding_priorities[i])))
8404                 {
8405                   detect_info.found = 1 << coding_priorities[i];
8406                   id = coding_categories[coding_priorities[i]].id;
8407                   val = Fcons (make_number (id), Qnil);
8408                   break;
8409                 }
8410         }
8411       else
8412         {
               /* Build the full candidate list.  Both loops walk the
                  priorities backwards and cons onto the front, so each
                  group ends up in priority order; the found categories
                  are consed last and therefore precede the ones that
                  were neither found nor rejected.  */
8413           int mask = detect_info.rejected | detect_info.found;
8414           int found = 0;
8415
8416           for (i = coding_category_raw_text - 1; i >= 0; i--)
8417             {
8418               category = coding_priorities[i];
8419               if (! (mask & (1 << category)))
8420                 {
8421                   found |= 1 << category;
8422                   id = coding_categories[category].id;
8423                   if (id >= 0)
8424                     val = Fcons (make_number (id), val);
8425                 }
8426             }
8427           for (i = coding_category_raw_text - 1; i >= 0; i--)
8428             {
8429               category = coding_priorities[i];
8430               if (detect_info.found & (1 << category))
8431                 {
8432                   id = coding_categories[category].id;
8433                   val = Fcons (make_number (id), val);
8434                 }
8435             }
8436           detect_info.found |= found;
8437         }
8438     }
8439   else if (base_category == coding_category_utf_8_auto)
8440     {
           /* Only decide between the with-signature and
              without-signature UTF-8 variants.  */
8441       if (detect_coding_utf_8 (&coding, &detect_info))
8442         {
8443           struct coding_system *this;
8444
8445           if (detect_info.found & CATEGORY_MASK_UTF_8_SIG)
8446             this = coding_categories + coding_category_utf_8_sig;
8447           else
8448             this = coding_categories + coding_category_utf_8_nosig;
8449           val = Fcons (make_number (this->id), Qnil);
8450         }
8451     }
8452   else if (base_category == coding_category_utf_16_auto)
8453     {
           /* Only decide which UTF-16 variant applies.  */
8454       if (detect_coding_utf_16 (&coding, &detect_info))
8455         {
8456           struct coding_system *this;
8457
8458           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
8459             this = coding_categories + coding_category_utf_16_le;
8460           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
8461             this = coding_categories + coding_category_utf_16_be;
8462           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
8463             this = coding_categories + coding_category_utf_16_be_nosig;
8464           else
8465             this = coding_categories + coding_category_utf_16_le_nosig;
8466           val = Fcons (make_number (this->id), Qnil);
8467         }
8468     }
8469   else
8470     {
           /* The text-format is already fixed by CODING-SYSTEM.  */
8471       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
8472       val = Fcons (make_number (coding.id), Qnil);
8473     }
8474
8475   /* Then, detect eol-format if necessary.  */
8476   {
         /* -1 means that no EOL format was determined for that group.  */
8477     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol = -1;
8478     Lisp_Object tail;
8479
8480     if (VECTORP (eol_type))
8481       {
             /* The EOL format is not fixed by CODING-SYSTEM: detect it
                separately for non-UTF-16, UTF-16BE and UTF-16LE
                candidates.  */
8482         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
8483           {
8484             if (null_byte_found)
8485               normal_eol = EOL_SEEN_LF;
8486             else
8487               normal_eol = detect_eol (coding.source, src_bytes,
8488                                        coding_category_raw_text);
8489           }
8490         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
8491                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
8492           utf_16_be_eol = detect_eol (coding.source, src_bytes,
8493                                       coding_category_utf_16_be);
8494         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
8495                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
8496           utf_16_le_eol = detect_eol (coding.source, src_bytes,
8497                                       coding_category_utf_16_le);
8498       }
8499     else
8500       {
             /* The EOL format was given by CODING-SYSTEM; anything that
                is neither `unix' nor `dos' is treated as CR.  */
8501         if (EQ (eol_type, Qunix))
8502           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
8503         else if (EQ (eol_type, Qdos))
8504           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
8505         else
8506           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
8507       }
8508
         /* Replace each ID in VAL by a name, in place.  */
8509     for (tail = val; CONSP (tail); tail = XCDR (tail))
8510       {
8511         enum coding_category category;
8512         int this_eol;
8513
8514         id = XINT (XCAR (tail));
8515         attrs = CODING_ID_ATTRS (id);
8516         category = XINT (CODING_ATTR_CATEGORY (attrs));
8517         eol_type = CODING_ID_EOL_TYPE (id);
8518         if (VECTORP (eol_type))
8519           {
8520             if (category == coding_category_utf_16_be
8521                 || category == coding_category_utf_16_be_nosig)
8522               this_eol = utf_16_be_eol;
8523             else if (category == coding_category_utf_16_le
8524                      || category == coding_category_utf_16_le_nosig)
8525               this_eol = utf_16_le_eol;
8526             else
8527               this_eol = normal_eol;
8528
                 /* Elements 0, 1 and 2 of the vector are used for LF,
                    CRLF and CR respectively (presumably the subsidiary
                    coding systems -- confirm against the definition of
                    the eol-type vector).  For any other value, use the
                    name of the coding system itself.  */
8529             if (this_eol == EOL_SEEN_LF)
8530               XSETCAR (tail, AREF (eol_type, 0));
8531             else if (this_eol == EOL_SEEN_CRLF)
8532               XSETCAR (tail, AREF (eol_type, 1));
8533             else if (this_eol == EOL_SEEN_CR)
8534               XSETCAR (tail, AREF (eol_type, 2));
8535             else
8536               XSETCAR (tail, CODING_ID_NAME (id));
8537           }
8538         else
8539           XSETCAR (tail, CODING_ID_NAME (id));
8540       }
8541   }
8542
8543   return (highest ? (CONSP (val) ? XCAR (val) : Qnil) : val);
8544 }
8545
8546
8547 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
8548        2, 3, 0,
8549        doc: /* Detect coding system of the text in the region between START and END.
8550 Return a list of possible coding systems ordered by priority.
8551 The coding systems to try and their priorities follows what
8552 the function `coding-system-priority-list' (which see) returns.
8553
8554 If only ASCII characters are found (except for such ISO-2022 control
8555 characters as ESC), it returns a list of single element `undecided'
8556 or its subsidiary coding system according to a detected end-of-line
8557 format.
8558
8559 If optional argument HIGHEST is non-nil, return the coding system of
8560 highest priority.  */)
8561      (start, end, highest)
8562      Lisp_Object start, end, highest;
8563 {
8564   int from, to;
8565   int from_byte, to_byte;
8566
8567   CHECK_NUMBER_COERCE_MARKER (start);
8568   CHECK_NUMBER_COERCE_MARKER (end);
8569
8570   validate_region (&start, &end);
8571   from = XINT (start), to = XINT (end);
8572   from_byte = CHAR_TO_BYTE (from);
8573   to_byte = CHAR_TO_BYTE (to);
8574
8575   if (from < GPT && to >= GPT)
8576     move_gap_both (to, to_byte);
8577
8578   return detect_coding_system (BYTE_POS_ADDR (from_byte),
8579                                to - from, to_byte - from_byte,
8580                                !NILP (highest),
8581                                !NILP (current_buffer
8582                                       ->enable_multibyte_characters),
8583                                Qnil);
8584 }
8585
8586 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
8587        1, 2, 0,
8588        doc: /* Detect coding system of the text in STRING.
8589 Return a list of possible coding systems ordered by priority.
8590 The coding systems to try and their priorities follows what
8591 the function `coding-system-priority-list' (which see) returns.
8592
8593 If only ASCII characters are found (except for such ISO-2022 control
8594 characters as ESC), it returns a list of single element `undecided'
8595 or its subsidiary coding system according to a detected end-of-line
8596 format.
8597
8598 If optional argument HIGHEST is non-nil, return the coding system of
8599 highest priority.  */)
8600      (string, highest)
8601      Lisp_Object string, highest;
8602 {
8603   CHECK_STRING (string);
8604
8605   return detect_coding_system (SDATA (string),
8606                                SCHARS (string), SBYTES (string),
8607                                !NILP (highest), STRING_MULTIBYTE (string),
8608                                Qnil);
8609 }
8610
8611
8612 static INLINE int
8613 char_encodable_p (c, attrs)
8614      int c;
8615      Lisp_Object attrs;
8616 {
8617   Lisp_Object tail;
8618   struct charset *charset;
8619   Lisp_Object translation_table;
8620
8621   translation_table = CODING_ATTR_TRANS_TBL (attrs);
8622   if (! NILP (translation_table))
8623     c = translate_char (translation_table, c);
8624   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
8625        CONSP (tail); tail = XCDR (tail))
8626     {
8627       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
8628       if (CHAR_CHARSET_P (c, charset))
8629         break;
8630     }
8631   return (! NILP (tail));
8632 }
8633
8634
8635 /* Return a list of coding systems that safely encode the text between
8636    START and END.  If EXCLUDE is non-nil, it is a list of coding
8637    systems not to check.  The returned list doesn't contain any such
8638    coding systems.  In any case, if the text contains only ASCII or is
8639    unibyte, return t.

        START may also be a string, in which case the whole string is
        examined and END is not used.  Otherwise the returned list
        always ends with `raw-text' and `no-conversion'.  */
8640
8641 DEFUN ("find-coding-systems-region-internal",
8642        Ffind_coding_systems_region_internal,
8643        Sfind_coding_systems_region_internal, 2, 3, 0,
8644        doc: /* Internal use only.  */)
8645      (start, end, exclude)
8646      Lisp_Object start, end, exclude;
8647 {
8648   Lisp_Object coding_attrs_list, safe_codings;
8649   EMACS_INT start_byte, end_byte;
8650   const unsigned char *p, *pbeg, *pend;
8651   int c;
8652   Lisp_Object tail, elt, work_table;
8653
8654   if (STRINGP (start))
8655     {
           /* A unibyte string, or a multibyte string whose character
              and byte counts are equal, needs no checking.  */
8656       if (!STRING_MULTIBYTE (start)
8657           || SCHARS (start) == SBYTES (start))
8658         return Qt;
8659       start_byte = 0;
8660       end_byte = SBYTES (start);
8661     }
8662   else
8663     {
8664       CHECK_NUMBER_COERCE_MARKER (start);
8665       CHECK_NUMBER_COERCE_MARKER (end);
8666       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8667         args_out_of_range (start, end);
8668       if (NILP (current_buffer->enable_multibyte_characters))
8669         return Qt;
8670       start_byte = CHAR_TO_BYTE (XINT (start));
8671       end_byte = CHAR_TO_BYTE (XINT (end));
           /* Equal character and byte lengths: nothing to check.  */
8672       if (XINT (end) - XINT (start) == end_byte - start_byte)
8673         return Qt;
8674
           /* If the gap lies strictly inside the region, move it to
              whichever end of the region is nearer, so that the
              region can be scanned with a single pointer.  */
8675       if (XINT (start) < GPT && XINT (end) > GPT)
8676         {
8677           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8678             move_gap_both (XINT (start), start_byte);
8679           else
8680             move_gap_both (XINT (end), end_byte);
8681         }
8682     }
8683
       /* Collect the attribute vectors of the candidates: every entry of
          Vcoding_system_list that is not in EXCLUDE, whose name equals
          its own base name, and whose type is not `undecided'.  Each
          collected vector also gets its coding_attr_trans_tbl slot set
          from get_translation_table.  */
8684   coding_attrs_list = Qnil;
8685   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
8686     if (NILP (exclude)
8687         || NILP (Fmemq (XCAR (tail), exclude)))
8688       {
8689         Lisp_Object attrs;
8690
8691         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
8692         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
8693             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
8694           {
8695             ASET (attrs, coding_attr_trans_tbl,
8696                   get_translation_table (attrs, 1, NULL));
8697             coding_attrs_list = Fcons (attrs, coding_attrs_list);
8698           }
8699       }
8700
8701   if (STRINGP (start))
8702     p = pbeg = SDATA (start);
8703   else
8704     p = pbeg = BYTE_POS_ADDR (start_byte);
8705   pend = p + (end_byte - start_byte);
8706
       /* Trim the ASCII bytes at both ends of the text.  */
8707   while (p < pend && ASCII_BYTE_P (*p)) p++;
8708   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8709
       /* WORK_TABLE records the characters that were already checked.  */
8710   work_table = Fmake_char_table (Qnil, Qnil);
8711   while (p < pend)
8712     {
8713       if (ASCII_BYTE_P (*p))
8714         p++;
8715       else
8716         {
8717           c = STRING_CHAR_ADVANCE (p);
8718           if (!NILP (char_table_ref (work_table, c)))
8719             /* This character was already checked.  Ignore it.  */
8720             continue;
8721
8722           charset_map_loaded = 0;
               /* Drop from the list every candidate that cannot encode
                  C.  A rejected cell that has a successor is
                  overwritten with the successor's car and cdr (and is
                  then re-examined, as TAIL does not advance); a
                  rejected last cell just gets nil as its car.  */
8723           for (tail = coding_attrs_list; CONSP (tail);)
8724             {
8725               elt = XCAR (tail);
8726               if (NILP (elt))
8727                 tail = XCDR (tail);
8728               else if (char_encodable_p (c, elt))
8729                 tail = XCDR (tail);
8730               else if (CONSP (XCDR (tail)))
8731                 {
8732                   XSETCAR (tail, XCAR (XCDR (tail)));
8733                   XSETCDR (tail, XCDR (XCDR (tail)));
8734                 }
8735               else
8736                 {
8737                   XSETCAR (tail, Qnil);
8738                   tail = XCDR (tail);
8739                 }
8740             }
               /* NOTE(review): charset_map_loaded is presumably set when
                  a charset map was loaded during the checks above, which
                  may have relocated the text -- confirm.  Recompute the
                  scan pointers from their offsets.  */
8741           if (charset_map_loaded)
8742             {
8743               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8744
8745               if (STRINGP (start))
8746                 pbeg = SDATA (start);
8747               else
8748                 pbeg = BYTE_POS_ADDR (start_byte);
8749               p = pbeg + p_offset;
8750               pend = pbeg + pend_offset;
8751             }
8752           char_table_set (work_table, c, Qt);
8753         }
8754     }
8755
       /* The result is the base names of the surviving candidates,
          followed by `raw-text' and `no-conversion'.  */
8756   safe_codings = list2 (Qraw_text, Qno_conversion);
8757   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
8758     if (! NILP (XCAR (tail)))
8759       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
8760
8761   return safe_codings;
8762 }
8763
8764
8765 DEFUN ("unencodable-char-position", Funencodable_char_position,
8766        Sunencodable_char_position, 3, 5, 0,
8767        doc: /*
8768 Return position of first un-encodable character in a region.
8769 START and END specify the region and CODING-SYSTEM specifies the
8770 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
8771
8772 If optional 4th argument COUNT is non-nil, it specifies at most how
8773 many un-encodable characters to search.  In this case, the value is a
8774 list of positions.
8775
8776 If optional 5th argument STRING is non-nil, it is a string to search
8777 for un-encodable characters.  In that case, START and END are indexes
8778 to the string.  */)
8779      (start, end, coding_system, count, string)
8780      Lisp_Object start, end, coding_system, count, string;
8781 {
8782   int n;
8783   struct coding_system coding;
8784   Lisp_Object attrs, charset_list, translation_table;
8785   Lisp_Object positions;
8786   int from, to;
8787   const unsigned char *p, *stop, *pend;
8788   int ascii_compatible;
8789
8790   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
8791   attrs = CODING_ID_ATTRS (coding.id);
8792   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
8793     return Qnil;
8794   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
8795   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8796   translation_table = get_translation_table (attrs, 1, NULL);
8797
8798   if (NILP (string))
8799     {
8800       validate_region (&start, &end);
8801       from = XINT (start);
8802       to = XINT (end);
8803       if (NILP (current_buffer->enable_multibyte_characters)
8804           || (ascii_compatible
8805               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
8806         return Qnil;
8807       p = CHAR_POS_ADDR (from);
8808       pend = CHAR_POS_ADDR (to);
8809       if (from < GPT && to >= GPT)
8810         stop = GPT_ADDR;
8811       else
8812         stop = pend;
8813     }
8814   else
8815     {
8816       CHECK_STRING (string);
8817       CHECK_NATNUM (start);
8818       CHECK_NATNUM (end);
8819       from = XINT (start);
8820       to = XINT (end);
8821       if (from > to
8822           || to > SCHARS (string))
8823         args_out_of_range_3 (string, start, end);
8824       if (! STRING_MULTIBYTE (string))
8825         return Qnil;
8826       p = SDATA (string) + string_char_to_byte (string, from);
8827       stop = pend = SDATA (string) + string_char_to_byte (string, to);
8828       if (ascii_compatible && (to - from) == (pend - p))
8829         return Qnil;
8830     }
8831
8832   if (NILP (count))
8833     n = 1;
8834   else
8835     {
8836       CHECK_NATNUM (count);
8837       n = XINT (count);
8838     }
8839
8840   positions = Qnil;
8841   while (1)
8842     {
8843       int c;
8844
8845       if (ascii_compatible)
8846         while (p < stop && ASCII_BYTE_P (*p))
8847           p++, from++;
8848       if (p >= stop)
8849         {
8850           if (p >= pend)
8851             break;
8852           stop = pend;
8853           p = GAP_END_ADDR;
8854         }
8855
8856       c = STRING_CHAR_ADVANCE (p);
8857       if (! (ASCII_CHAR_P (c) && ascii_compatible)
8858           && ! char_charset (translate_char (translation_table, c),
8859                              charset_list, NULL))
8860         {
8861           positions = Fcons (make_number (from), positions);
8862           n--;
8863           if (n == 0)
8864             break;
8865         }
8866
8867       from++;
8868     }
8869
8870   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
8871 }
8872
8873
8874 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
8875        Scheck_coding_systems_region, 3, 3, 0,
8876        doc: /* Check if the region is encodable by coding systems.
8877
8878 START and END are buffer positions specifying the region.
8879 CODING-SYSTEM-LIST is a list of coding systems to check.
8880
8881 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
8882 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
8883 whole region, POS0, POS1, ... are buffer positions where non-encodable
8884 characters are found.
8885
8886 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
8887 value is nil.
8888
8889 START may be a string.  In that case, check if the string is
8890 encodable, and the value contains indices to the string instead of
8891 buffer positions.  END is ignored.
8892
8893 If the current buffer (or START if it is a string) is unibyte, the value
8894 is nil.  */)
8895      (start, end, coding_system_list)
8896      Lisp_Object start, end, coding_system_list;
8897 {
8898   Lisp_Object list;
8899   EMACS_INT start_byte, end_byte;
8900   int pos;
8901   const unsigned char *p, *pbeg, *pend;
8902   int c;
8903   Lisp_Object tail, elt, attrs;
8904
8905   if (STRINGP (start))
8906     {
8907       if (!STRING_MULTIBYTE (start)
8908           || SCHARS (start) == SBYTES (start))
8909         return Qnil;
8910       start_byte = 0;
8911       end_byte = SBYTES (start);
8912       pos = 0;
8913     }
8914   else
8915     {
8916       CHECK_NUMBER_COERCE_MARKER (start);
8917       CHECK_NUMBER_COERCE_MARKER (end);
8918       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
8919         args_out_of_range (start, end);
8920       if (NILP (current_buffer->enable_multibyte_characters))
8921         return Qnil;
8922       start_byte = CHAR_TO_BYTE (XINT (start));
8923       end_byte = CHAR_TO_BYTE (XINT (end));
8924       if (XINT (end) - XINT (start) == end_byte - start_byte)
8925         return Qnil;
8926
8927       if (XINT (start) < GPT && XINT (end) > GPT)
8928         {
8929           if ((GPT - XINT (start)) < (XINT (end) - GPT))
8930             move_gap_both (XINT (start), start_byte);
8931           else
8932             move_gap_both (XINT (end), end_byte);
8933         }
8934       pos = XINT (start);
8935     }
8936
8937   list = Qnil;
8938   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
8939     {
8940       elt = XCAR (tail);
8941       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
8942       ASET (attrs, coding_attr_trans_tbl,
8943             get_translation_table (attrs, 1, NULL));
8944       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
8945     }
8946
8947   if (STRINGP (start))
8948     p = pbeg = SDATA (start);
8949   else
8950     p = pbeg = BYTE_POS_ADDR (start_byte);
8951   pend = p + (end_byte - start_byte);
8952
8953   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
8954   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
8955
8956   while (p < pend)
8957     {
8958       if (ASCII_BYTE_P (*p))
8959         p++;
8960       else
8961         {
8962           c = STRING_CHAR_ADVANCE (p);
8963
8964           charset_map_loaded = 0;
8965           for (tail = list; CONSP (tail); tail = XCDR (tail))
8966             {
8967               elt = XCDR (XCAR (tail));
8968               if (! char_encodable_p (c, XCAR (elt)))
8969                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
8970             }
8971           if (charset_map_loaded)
8972             {
8973               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
8974
8975               if (STRINGP (start))
8976                 pbeg = SDATA (start);
8977               else
8978                 pbeg = BYTE_POS_ADDR (start_byte);
8979               p = pbeg + p_offset;
8980               pend = pbeg + pend_offset;
8981             }
8982         }
8983       pos++;
8984     }
8985
8986   tail = list;
8987   list = Qnil;
8988   for (; CONSP (tail); tail = XCDR (tail))
8989     {
8990       elt = XCAR (tail);
8991       if (CONSP (XCDR (XCDR (elt))))
8992         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
8993                       list);
8994     }
8995
8996   return list;
8997 }
8998
8999
9000 Lisp_Object
9001 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
9002      Lisp_Object start, end, coding_system, dst_object;
9003      int encodep, norecord;
9004 {
9005   struct coding_system coding;
9006   EMACS_INT from, from_byte, to, to_byte;
9007   Lisp_Object src_object;
9008
9009   CHECK_NUMBER_COERCE_MARKER (start);
9010   CHECK_NUMBER_COERCE_MARKER (end);
9011   if (NILP (coding_system))
9012     coding_system = Qno_conversion;
9013   else
9014     CHECK_CODING_SYSTEM (coding_system);
9015   src_object = Fcurrent_buffer ();
9016   if (NILP (dst_object))
9017     dst_object = src_object;
9018   else if (! EQ (dst_object, Qt))
9019     CHECK_BUFFER (dst_object);
9020
9021   validate_region (&start, &end);
9022   from = XFASTINT (start);
9023   from_byte = CHAR_TO_BYTE (from);
9024   to = XFASTINT (end);
9025   to_byte = CHAR_TO_BYTE (to);
9026
9027   setup_coding_system (coding_system, &coding);
9028   coding.mode |= CODING_MODE_LAST_BLOCK;
9029
9030   if (encodep)
9031     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9032                           dst_object);
9033   else
9034     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
9035                           dst_object);
9036   if (! norecord)
9037     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9038
9039   return (BUFFERP (dst_object)
9040           ? make_number (coding.produced_char)
9041           : coding.dst_object);
9042 }
9043
9044
9045 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
9046        3, 4, "r\nzCoding system: ",
9047        doc: /* Decode the current region from the specified coding system.
9048 When called from a program, takes four arguments:
9049         START, END, CODING-SYSTEM, and DESTINATION.
9050 START and END are buffer positions.
9051
9052 Optional 4th arguments DESTINATION specifies where the decoded text goes.
9053 If nil, the region between START and END is replaced by the decoded text.
9054 If buffer, the decoded text is inserted in that buffer after point (point
9055 does not move).
9056 In those cases, the length of the decoded text is returned.
9057 If DESTINATION is t, the decoded text is returned.
9058
9059 This function sets `last-coding-system-used' to the precise coding system
9060 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9061 not fully specified.)  */)
9062      (start, end, coding_system, destination)
9063      Lisp_Object start, end, coding_system, destination;
9064 {
9065   return code_convert_region (start, end, coding_system, destination, 0, 0);
9066 }
9067
9068 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
9069        3, 4, "r\nzCoding system: ",
9070        doc: /* Encode the current region by specified coding system.
9071 When called from a program, takes four arguments:
9072         START, END, CODING-SYSTEM and DESTINATION.
9073 START and END are buffer positions.
9074
9075 Optional 4th arguments DESTINATION specifies where the encoded text goes.
9076 If nil, the region between START and END is replace by the encoded text.
9077 If buffer, the encoded text is inserted in that buffer after point (point
9078 does not move).
9079 In those cases, the length of the encoded text is returned.
9080 If DESTINATION is t, the encoded text is returned.
9081
9082 This function sets `last-coding-system-used' to the precise coding system
9083 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9084 not fully specified.)  */)
9085   (start, end, coding_system, destination)
9086      Lisp_Object start, end, coding_system, destination;
9087 {
9088   return code_convert_region (start, end, coding_system, destination, 1, 0);
9089 }
9090
9091 Lisp_Object
9092 code_convert_string (string, coding_system, dst_object,
9093                      encodep, nocopy, norecord)
9094      Lisp_Object string, coding_system, dst_object;
9095      int encodep, nocopy, norecord;
9096 {
9097   struct coding_system coding;
9098   EMACS_INT chars, bytes;
9099
9100   CHECK_STRING (string);
9101   if (NILP (coding_system))
9102     {
9103       if (! norecord)
9104         Vlast_coding_system_used = Qno_conversion;
9105       if (NILP (dst_object))
9106         return (nocopy ? Fcopy_sequence (string) : string);
9107     }
9108
9109   if (NILP (coding_system))
9110     coding_system = Qno_conversion;
9111   else
9112     CHECK_CODING_SYSTEM (coding_system);
9113   if (NILP (dst_object))
9114     dst_object = Qt;
9115   else if (! EQ (dst_object, Qt))
9116     CHECK_BUFFER (dst_object);
9117
9118   setup_coding_system (coding_system, &coding);
9119   coding.mode |= CODING_MODE_LAST_BLOCK;
9120   chars = SCHARS (string);
9121   bytes = SBYTES (string);
9122   if (encodep)
9123     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9124   else
9125     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
9126   if (! norecord)
9127     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
9128
9129   return (BUFFERP (dst_object)
9130           ? make_number (coding.produced_char)
9131           : coding.dst_object);
9132 }
9133
9134
9135 /* Encode or decode STRING according to CODING_SYSTEM.
9136    Do not set Vlast_coding_system_used.
9137
9138    This function is called only from macros DECODE_FILE and
9139    ENCODE_FILE, thus we ignore character composition.  */
9140
9141 Lisp_Object
9142 code_convert_string_norecord (string, coding_system, encodep)
9143      Lisp_Object string, coding_system;
9144      int encodep;
9145 {
9146   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
9147 }
9148
9149
9150 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
9151        2, 4, 0,
9152        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
9153
9154 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
9155 if the decoding operation is trivial.
9156
9157 Optional fourth arg BUFFER non-nil means that the decoded text is
9158 inserted in that buffer after point (point does not move).  In this
9159 case, the return value is the length of the decoded text.
9160
9161 This function sets `last-coding-system-used' to the precise coding system
9162 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9163 not fully specified.)  */)
9164   (string, coding_system, nocopy, buffer)
9165      Lisp_Object string, coding_system, nocopy, buffer;
9166 {
9167   return code_convert_string (string, coding_system, buffer,
9168                               0, ! NILP (nocopy), 0);
9169 }
9170
9171 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
9172        2, 4, 0,
9173        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
9174
9175 Optional third arg NOCOPY non-nil means it is OK to return STRING
9176 itself if the encoding operation is trivial.
9177
9178 Optional fourth arg BUFFER non-nil means that the encoded text is
9179 inserted in that buffer after point (point does not move).  In this
9180 case, the return value is the length of the encoded text.
9181
9182 This function sets `last-coding-system-used' to the precise coding system
9183 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
9184 not fully specified.)  */)
9185      (string, coding_system, nocopy, buffer)
9186      Lisp_Object string, coding_system, nocopy, buffer;
9187 {
9188   return code_convert_string (string, coding_system, buffer,
9189                               1, ! NILP (nocopy), 1);
9190 }
9191
9192 \f
9193 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
9194        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
9195 Return the corresponding character.  */)
9196      (code)
9197      Lisp_Object code;
9198 {
9199   Lisp_Object spec, attrs, val;
9200   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
9201   int c;
9202
9203   CHECK_NATNUM (code);
9204   c = XFASTINT (code);
9205   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9206   attrs = AREF (spec, 0);
9207
9208   if (ASCII_BYTE_P (c)
9209       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9210     return code;
9211
9212   val = CODING_ATTR_CHARSET_LIST (attrs);
9213   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9214   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9215   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
9216
9217   if (c <= 0x7F)
9218     charset = charset_roman;
9219   else if (c >= 0xA0 && c < 0xDF)
9220     {
9221       charset = charset_kana;
9222       c -= 0x80;
9223     }
9224   else
9225     {
9226       int s1 = c >> 8, s2 = c & 0xFF;
9227
9228       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
9229           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
9230         error ("Invalid code: %d", code);
9231       SJIS_TO_JIS (c);
9232       charset = charset_kanji;
9233     }
9234   c = DECODE_CHAR (charset, c);
9235   if (c < 0)
9236     error ("Invalid code: %d", code);
9237   return make_number (c);
9238 }
9239
9240
9241 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
9242        doc: /* Encode a Japanese character CH to shift_jis encoding.
9243 Return the corresponding code in SJIS.  */)
9244      (ch)
9245     Lisp_Object ch;
9246 {
9247   Lisp_Object spec, attrs, charset_list;
9248   int c;
9249   struct charset *charset;
9250   unsigned code;
9251
9252   CHECK_CHARACTER (ch);
9253   c = XFASTINT (ch);
9254   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
9255   attrs = AREF (spec, 0);
9256
9257   if (ASCII_CHAR_P (c)
9258       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9259     return ch;
9260
9261   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9262   charset = char_charset (c, charset_list, &code);
9263   if (code == CHARSET_INVALID_CODE (charset))
9264     error ("Can't encode by shift_jis encoding: %d", c);
9265   JIS_TO_SJIS (code);
9266
9267   return make_number (code);
9268 }
9269
9270 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
9271        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
9272 Return the corresponding character.  */)
9273      (code)
9274      Lisp_Object code;
9275 {
9276   Lisp_Object spec, attrs, val;
9277   struct charset *charset_roman, *charset_big5, *charset;
9278   int c;
9279
9280   CHECK_NATNUM (code);
9281   c = XFASTINT (code);
9282   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9283   attrs = AREF (spec, 0);
9284
9285   if (ASCII_BYTE_P (c)
9286       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9287     return code;
9288
9289   val = CODING_ATTR_CHARSET_LIST (attrs);
9290   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
9291   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
9292
9293   if (c <= 0x7F)
9294     charset = charset_roman;
9295   else
9296     {
9297       int b1 = c >> 8, b2 = c & 0x7F;
9298       if (b1 < 0xA1 || b1 > 0xFE
9299           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
9300         error ("Invalid code: %d", code);
9301       charset = charset_big5;
9302     }
9303   c = DECODE_CHAR (charset, (unsigned )c);
9304   if (c < 0)
9305     error ("Invalid code: %d", code);
9306   return make_number (c);
9307 }
9308
9309 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
9310        doc: /* Encode the Big5 character CH to BIG5 coding system.
9311 Return the corresponding character code in Big5.  */)
9312      (ch)
9313      Lisp_Object ch;
9314 {
9315   Lisp_Object spec, attrs, charset_list;
9316   struct charset *charset;
9317   int c;
9318   unsigned code;
9319
9320   CHECK_CHARACTER (ch);
9321   c = XFASTINT (ch);
9322   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
9323   attrs = AREF (spec, 0);
9324   if (ASCII_CHAR_P (c)
9325       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
9326     return ch;
9327
9328   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
9329   charset = char_charset (c, charset_list, &code);
9330   if (code == CHARSET_INVALID_CODE (charset))
9331     error ("Can't encode by Big5 encoding: %d", c);
9332
9333   return make_number (code);
9334 }
9335
9336 \f
9337 DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal,
9338        Sset_terminal_coding_system_internal, 1, 2, 0,
9339        doc: /* Internal use only.  */)
9340      (coding_system, terminal)
9341      Lisp_Object coding_system;
9342      Lisp_Object terminal;
9343 {
9344   struct coding_system *terminal_coding = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9345   CHECK_SYMBOL (coding_system);
9346   setup_coding_system (Fcheck_coding_system (coding_system), terminal_coding);
9347   /* We had better not send unsafe characters to terminal.  */
9348   terminal_coding->mode |= CODING_MODE_SAFE_ENCODING;
9349   /* Characer composition should be disabled.  */
9350   terminal_coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9351   terminal_coding->src_multibyte = 1;
9352   terminal_coding->dst_multibyte = 0;
9353   return Qnil;
9354 }
9355
9356 DEFUN ("set-safe-terminal-coding-system-internal",
9357        Fset_safe_terminal_coding_system_internal,
9358        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
9359        doc: /* Internal use only.  */)
9360      (coding_system)
9361      Lisp_Object coding_system;
9362 {
9363   CHECK_SYMBOL (coding_system);
9364   setup_coding_system (Fcheck_coding_system (coding_system),
9365                        &safe_terminal_coding);
9366   /* Characer composition should be disabled.  */
9367   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9368   safe_terminal_coding.src_multibyte = 1;
9369   safe_terminal_coding.dst_multibyte = 0;
9370   return Qnil;
9371 }
9372
9373 DEFUN ("terminal-coding-system", Fterminal_coding_system,
9374        Sterminal_coding_system, 0, 1, 0,
9375        doc: /* Return coding system specified for terminal output on the given terminal.
9376 TERMINAL may be a terminal object, a frame, or nil for the selected
9377 frame's terminal device.  */)
9378      (terminal)
9379      Lisp_Object terminal;
9380 {
9381   struct coding_system *terminal_coding
9382     = TERMINAL_TERMINAL_CODING (get_terminal (terminal, 1));
9383   Lisp_Object coding_system = CODING_ID_NAME (terminal_coding->id);
9384
9385   /* For backward compatibility, return nil if it is `undecided'. */
9386   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
9387 }
9388
9389 DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal,
9390        Sset_keyboard_coding_system_internal, 1, 2, 0,
9391        doc: /* Internal use only.  */)
9392      (coding_system, terminal)
9393      Lisp_Object coding_system;
9394      Lisp_Object terminal;
9395 {
9396   struct terminal *t = get_terminal (terminal, 1);
9397   CHECK_SYMBOL (coding_system);
9398   if (NILP (coding_system))
9399     coding_system = Qno_conversion;
9400   else
9401     Fcheck_coding_system (coding_system);
9402   setup_coding_system (coding_system, TERMINAL_KEYBOARD_CODING (t));
9403   /* Characer composition should be disabled.  */
9404   TERMINAL_KEYBOARD_CODING (t)->common_flags
9405     &= ~CODING_ANNOTATE_COMPOSITION_MASK;
9406   return Qnil;
9407 }
9408
9409 DEFUN ("keyboard-coding-system",
9410        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 1, 0,
9411        doc: /* Return coding system specified for decoding keyboard input.  */)
9412      (terminal)
9413      Lisp_Object terminal;
9414 {
9415   return CODING_ID_NAME (TERMINAL_KEYBOARD_CODING
9416                          (get_terminal (terminal, 1))->id);
9417 }
9418
9419 \f
9420 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
9421        Sfind_operation_coding_system,  1, MANY, 0,
9422        doc: /* Choose a coding system for an operation based on the target name.
9423 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
9424 DECODING-SYSTEM is the coding system to use for decoding
9425 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
9426 for encoding (in case OPERATION does encoding).
9427
9428 The first argument OPERATION specifies an I/O primitive:
9429   For file I/O, `insert-file-contents' or `write-region'.
9430   For process I/O, `call-process', `call-process-region', or `start-process'.
9431   For network I/O, `open-network-stream'.
9432
9433 The remaining arguments should be the same arguments that were passed
9434 to the primitive.  Depending on which primitive, one of those arguments
9435 is selected as the TARGET.  For example, if OPERATION does file I/O,
9436 whichever argument specifies the file name is TARGET.
9437
9438 TARGET has a meaning which depends on OPERATION:
9439   For file I/O, TARGET is a file name (except for the special case below).
9440   For process I/O, TARGET is a process name.
9441   For network I/O, TARGET is a service name or a port number.
9442
9443 This function looks up what is specified for TARGET in
9444 `file-coding-system-alist', `process-coding-system-alist',
9445 or `network-coding-system-alist' depending on OPERATION.
9446 They may specify a coding system, a cons of coding systems,
9447 or a function symbol to call.
9448 In the last case, we call the function with one argument,
9449 which is a list of all the arguments given to this function.
9450 If the function can't decide a coding system, it can return
9451 `undecided' so that the normal code-detection is performed.
9452
9453 If OPERATION is `insert-file-contents', the argument corresponding to
9454 TARGET may be a cons (FILENAME . BUFFER).  In that case, FILENAME is a
9455 file name to look up, and BUFFER is a buffer that contains the file's
9456 contents (not yet decoded).  If `file-coding-system-alist' specifies a
9457 function to call for FILENAME, that function should examine the
9458 contents of BUFFER instead of reading the file.
9459
9460 usage: (find-operation-coding-system OPERATION ARGUMENTS...)  */)
9461      (nargs, args)
9462      int nargs;
9463      Lisp_Object *args;
9464 {
9465   Lisp_Object operation, target_idx, target, val;
9466   register Lisp_Object chain;
9467
9468   if (nargs < 2)
9469     error ("Too few arguments");
9470   operation = args[0];
9471   if (!SYMBOLP (operation)
9472       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
9473     error ("Invalid first argument");
9474   if (nargs < 1 + XINT (target_idx))
9475     error ("Too few arguments for operation: %s",
9476            SDATA (SYMBOL_NAME (operation)));
9477   target = args[XINT (target_idx) + 1];
9478   if (!(STRINGP (target)
9479         || (EQ (operation, Qinsert_file_contents) && CONSP (target)
9480             && STRINGP (XCAR (target)) && BUFFERP (XCDR (target)))
9481         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
9482     error ("Invalid %dth argument", XINT (target_idx) + 1);
9483   if (CONSP (target))
9484     target = XCAR (target);
9485
9486   chain = ((EQ (operation, Qinsert_file_contents)
9487             || EQ (operation, Qwrite_region))
9488            ? Vfile_coding_system_alist
9489            : (EQ (operation, Qopen_network_stream)
9490               ? Vnetwork_coding_system_alist
9491               : Vprocess_coding_system_alist));
9492   if (NILP (chain))
9493     return Qnil;
9494
9495   for (; CONSP (chain); chain = XCDR (chain))
9496     {
9497       Lisp_Object elt;
9498
9499       elt = XCAR (chain);
9500       if (CONSP (elt)
9501           && ((STRINGP (target)
9502                && STRINGP (XCAR (elt))
9503                && fast_string_match (XCAR (elt), target) >= 0)
9504               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
9505         {
9506           val = XCDR (elt);
9507           /* Here, if VAL is both a valid coding system and a valid
9508              function symbol, we return VAL as a coding system.  */
9509           if (CONSP (val))
9510             return val;
9511           if (! SYMBOLP (val))
9512             return Qnil;
9513           if (! NILP (Fcoding_system_p (val)))
9514             return Fcons (val, val);
9515           if (! NILP (Ffboundp (val)))
9516             {
9517               /* We use call1 rather than safe_call1
9518                  so as to get bug reports about functions called here
9519                  which don't handle the current interface.  */
9520               val = call1 (val, Flist (nargs, args));
9521               if (CONSP (val))
9522                 return val;
9523               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
9524                 return Fcons (val, val);
9525             }
9526           return Qnil;
9527         }
9528     }
9529   return Qnil;
9530 }
9531
9532 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
9533        Sset_coding_system_priority, 0, MANY, 0,
9534        doc: /* Assign higher priority to the coding systems given as arguments.
9535 If multiple coding systems belong to the same category,
9536 all but the first one are ignored.
9537
9538 usage: (set-coding-system-priority &rest coding-systems)  */)
9539      (nargs, args)
9540      int nargs;
9541      Lisp_Object *args;
9542 {
9543   int i, j;
9544   int changed[coding_category_max];
9545   enum coding_category priorities[coding_category_max];
9546
9547   bzero (changed, sizeof changed);
9548
9549   for (i = j = 0; i < nargs; i++)
9550     {
9551       enum coding_category category;
9552       Lisp_Object spec, attrs;
9553
9554       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
9555       attrs = AREF (spec, 0);
9556       category = XINT (CODING_ATTR_CATEGORY (attrs));
9557       if (changed[category])
9558         /* Ignore this coding system because a coding system of the
9559            same category already had a higher priority.  */
9560         continue;
9561       changed[category] = 1;
9562       priorities[j++] = category;
9563       if (coding_categories[category].id >= 0
9564           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
9565         setup_coding_system (args[i], &coding_categories[category]);
9566       Fset (AREF (Vcoding_category_table, category), args[i]);
9567     }
9568
9569   /* Now we have decided top J priorities.  Reflect the order of the
9570      original priorities to the remaining priorities.  */
9571
9572   for (i = j, j = 0; i < coding_category_max; i++, j++)
9573     {
9574       while (j < coding_category_max
9575              && changed[coding_priorities[j]])
9576         j++;
9577       if (j == coding_category_max)
9578         abort ();
9579       priorities[i] = coding_priorities[j];
9580     }
9581
9582   bcopy (priorities, coding_priorities, sizeof priorities);
9583
9584   /* Update `coding-category-list'.  */
9585   Vcoding_category_list = Qnil;
9586   for (i = coding_category_max - 1; i >= 0; i--)
9587     Vcoding_category_list
9588       = Fcons (AREF (Vcoding_category_table, priorities[i]),
9589                Vcoding_category_list);
9590
9591   return Qnil;
9592 }
9593
9594 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
9595        Scoding_system_priority_list, 0, 1, 0,
9596        doc: /* Return a list of coding systems ordered by their priorities.
9597 The list contains a subset of coding systems; i.e. coding systems
9598 assigned to each coding category (see `coding-category-list').
9599
9600 HIGHESTP non-nil means just return the highest priority one.  */)
9601      (highestp)
9602      Lisp_Object highestp;
9603 {
9604   int i;
9605   Lisp_Object val;
9606
9607   for (i = 0, val = Qnil; i < coding_category_max; i++)
9608     {
9609       enum coding_category category = coding_priorities[i];
9610       int id = coding_categories[category].id;
9611       Lisp_Object attrs;
9612
9613       if (id < 0)
9614         continue;
9615       attrs = CODING_ID_ATTRS (id);
9616       if (! NILP (highestp))
9617         return CODING_ATTR_BASE_NAME (attrs);
9618       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
9619     }
9620   return Fnreverse (val);
9621 }
9622
9623 static char *suffixes[] = { "-unix", "-dos", "-mac" };
9624
9625 static Lisp_Object
9626 make_subsidiaries (base)
9627      Lisp_Object base;
9628 {
9629   Lisp_Object subsidiaries;
9630   int base_name_len = SBYTES (SYMBOL_NAME (base));
9631   char *buf = (char *) alloca (base_name_len + 6);
9632   int i;
9633
9634   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
9635   subsidiaries = Fmake_vector (make_number (3), Qnil);
9636   for (i = 0; i < 3; i++)
9637     {
9638       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
9639       ASET (subsidiaries, i, intern (buf));
9640     }
9641   return subsidiaries;
9642 }
9643
9644
/* Define a coding system from the positional arguments in ARGS.

   The common arguments (those indexed below coding_arg_max) are
   validated and stored into a fresh attribute vector ATTRS.  Then the
   arguments specific to the coding type (charset, ccl, utf-16,
   iso-2022, emacs-mule, shift-jis, big5, raw-text, utf-8 or
   undecided) are validated; the coding type also determines the
   coding category.

   The resulting spec vector [ATTRS ALIASES EOL_TYPE] is stored in
   Vcoding_system_hash_table under the coding system's name, and the
   name is added to Vcoding_system_list and Vcoding_system_alist.
   When the eol-type argument is nil, three subsidiary coding systems
   (for unix, dos and mac) are registered as well.

   Signals wrong-number-of-arguments when NARGS is too small for the
   requested coding type, and an error for invalid argument values.
   Returns nil on success.  */
9645 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
9646        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
9647        doc: /* For internal use only.
9648 usage: (define-coding-system-internal ...)  */)
9649      (nargs, args)
9650      int nargs;
9651      Lisp_Object *args;
9652 {
9653   Lisp_Object name;
9654   Lisp_Object spec_vec;         /* [ ATTRS ALIASES EOL_TYPE ] */
9655   Lisp_Object attrs;            /* Vector of attributes.  */
9656   Lisp_Object eol_type;
9657   Lisp_Object aliases;
9658   Lisp_Object coding_type, charset_list, safe_charsets;
9659   enum coding_category category;
9660   Lisp_Object tail, val;
9661   int max_charset_id = 0;
9662   int i;
9663
  /* Every coding type needs at least the common arguments.  */
9664   if (nargs < coding_arg_max)
9665     goto short_args;
9666
9667   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
9668
9669   name = args[coding_arg_name];
9670   CHECK_SYMBOL (name);
9671   CODING_ATTR_BASE_NAME (attrs) = name;
9672
  /* The mnemonic may be either a string or a character.  */
9673   val = args[coding_arg_mnemonic];
9674   if (! STRINGP (val))
9675     CHECK_CHARACTER (val);
9676   CODING_ATTR_MNEMONIC (attrs) = val;
9677
9678   coding_type = args[coding_arg_coding_type];
9679   CHECK_SYMBOL (coding_type);
9680   CODING_ATTR_TYPE (attrs) = coding_type;
9681
  /* CHARSET_LIST is either a symbol or a list of charsets.  In both
     cases MAX_CHARSET_ID ends up as the largest charset ID found in
     the resulting list (or stays 0 when there is none).  */
9682   charset_list = args[coding_arg_charset_list];
9683   if (SYMBOLP (charset_list))
9684     {
      /* The symbols iso-2022 and emacs-mule stand for the predefined
         lists below and are accepted only for the coding type of the
         same name.  The elements are read with XFASTINT, so they are
         presumably charset IDs already -- TODO confirm.  */
9685       if (EQ (charset_list, Qiso_2022))
9686         {
9687           if (! EQ (coding_type, Qiso_2022))
9688             error ("Invalid charset-list");
9689           charset_list = Viso_2022_charset_list;
9690         }
9691       else if (EQ (charset_list, Qemacs_mule))
9692         {
9693           if (! EQ (coding_type, Qemacs_mule))
9694             error ("Invalid charset-list");
9695           charset_list = Vemacs_mule_charset_list;
9696         }
9697       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9698         if (max_charset_id < XFASTINT (XCAR (tail)))
9699           max_charset_id = XFASTINT (XCAR (tail));
9700     }
9701   else
9702     {
      /* Work on a copy of the list; each charset in the copy is
         replaced by its numeric ID after being validated.  */
9703       charset_list = Fcopy_sequence (charset_list);
9704       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9705         {
9706           struct charset *charset;
9707
9708           val = XCAR (tail);
9709           CHECK_CHARSET_GET_CHARSET (val, charset);
          /* For iso-2022 the charset needs a non-negative ISO final
             value, for emacs-mule a non-negative emacs-mule ID; the
             other coding types accept any charset here.  */
9710           if (EQ (coding_type, Qiso_2022)
9711               ? CHARSET_ISO_FINAL (charset) < 0
9712               : EQ (coding_type, Qemacs_mule)
9713               ? CHARSET_EMACS_MULE_ID (charset) < 0
9714               : 0)
9715             error ("Can't handle charset `%s'",
9716                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
9717
9718           XSETCAR (tail, make_number (charset->id));
9719           if (max_charset_id < charset->id)
9720             max_charset_id = charset->id;
9721         }
9722     }
9723   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
9724
  /* SAFE_CHARSETS is a string of MAX_CHARSET_ID + 1 bytes indexed by
     charset ID: 0 for each charset in CHARSET_LIST, 255 otherwise.  */
9725   safe_charsets = make_uninit_string (max_charset_id + 1);
9726   memset (SDATA (safe_charsets), 255, max_charset_id + 1);
9727   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9728     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
9729   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
9730
  /* Initial value only; several coding types override it below.  */
9731   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
9732
  /* A translation table may be a char-table, a cons, or a symbol.  */
9733   val = args[coding_arg_decode_translation_table];
9734   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9735     CHECK_SYMBOL (val);
9736   CODING_ATTR_DECODE_TBL (attrs) = val;
9737
9738   val = args[coding_arg_encode_translation_table];
9739   if (! CHAR_TABLE_P (val) && ! CONSP (val))
9740     CHECK_SYMBOL (val);
9741   CODING_ATTR_ENCODE_TBL (attrs) = val;
9742
9743   val = args[coding_arg_post_read_conversion];
9744   CHECK_SYMBOL (val);
9745   CODING_ATTR_POST_READ (attrs) = val;
9746
9747   val = args[coding_arg_pre_write_conversion];
9748   CHECK_SYMBOL (val);
9749   CODING_ATTR_PRE_WRITE (attrs) = val;
9750
  /* A nil default character means a space.  */
9751   val = args[coding_arg_default_char];
9752   if (NILP (val))
9753     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
9754   else
9755     {
9756       CHECK_CHARACTER (val);
9757       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9758     }
9759
  /* Normalize the for-unibyte flag to nil or t.  */
9760   val = args[coding_arg_for_unibyte];
9761   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
9762
9763   val = args[coding_arg_plist];
9764   CHECK_LIST (val);
9765   CODING_ATTR_PLIST (attrs) = val;
9766
  /* Type-specific part: validate the extra arguments of the coding
     type and choose CATEGORY.  */
9767   if (EQ (coding_type, Qcharset))
9768     {
9769       /* Generate a lisp vector of 256 elements.  Each element is nil,
9770          integer, or a list of charset IDs.
9771
9772          If Nth element is nil, the byte code N is invalid in this
9773          coding system.
9774
9775          If Nth element is a number NUM, N is the first byte of a
9776          charset whose ID is NUM.
9777
9778          If Nth element is a list of charset IDs, N is the first byte
9779          of one of them.  The list is sorted by dimensions of the
9780          charsets.  A charset of smaller dimension comes first. */
9781       val = Fmake_vector (make_number (256), Qnil);
9782
9783       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
9784         {
9785           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
9786           int dim = CHARSET_DIMENSION (charset);
          /* Index into code_space of the byte range that is walked
             below as the set of possible first bytes.  */
9787           int idx = (dim - 1) * 4;
9788
9789           if (CHARSET_ASCII_COMPATIBLE_P (charset))
9790             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9791
9792           for (i = charset->code_space[idx];
9793                i <= charset->code_space[idx + 1]; i++)
9794             {
9795               Lisp_Object tmp, tmp2;
9796               int dim2;
9797
9798               tmp = AREF (val, i);
9799               if (NILP (tmp))
9800                 tmp = XCAR (tail);
9801               else if (NUMBERP (tmp))
9802                 {
                  /* A second charset claims this byte: turn the slot
                     into a two-element list, smaller dimension
                     first.  */
9803                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
9804                   if (dim < dim2)
9805                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
9806                   else
9807                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
9808                 }
9809               else
9810                 {
                  /* Find the first element whose dimension is larger
                     than DIM and insert the new ID in front of it, or
                     append the ID when there is no such element.  */
9811                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
9812                     {
9813                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
9814                       if (dim < dim2)
9815                         break;
9816                     }
9817                   if (NILP (tmp2))
9818                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
9819                   else
9820                     {
                      /* In-place insertion: duplicate the cell at
                         TMP2, then overwrite its car.  */
9821                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
9822                       XSETCAR (tmp2, XCAR (tail));
9823                     }
9824                 }
9825               ASET (val, i, tmp);
9826             }
9827         }
9828       ASET (attrs, coding_attr_charset_valids, val);
9829       category = coding_category_charset;
9830     }
9831   else if (EQ (coding_type, Qccl))
9832     {
9833       Lisp_Object valids;
9834
9835       if (nargs < coding_arg_ccl_max)
9836         goto short_args;
9837
      /* A CCL program given as a vector is copied before being
         stored.  */
9838       val = args[coding_arg_ccl_decoder];
9839       CHECK_CCL_PROGRAM (val);
9840       if (VECTORP (val))
9841         val = Fcopy_sequence (val);
9842       ASET (attrs, coding_attr_ccl_decoder, val);
9843
9844       val = args[coding_arg_ccl_encoder];
9845       CHECK_CCL_PROGRAM (val);
9846       if (VECTORP (val))
9847         val = Fcopy_sequence (val);
9848       ASET (attrs, coding_attr_ccl_encoder, val);
9849
      /* VALIDS is a 256-element string, initially all 0.  Element N
         is set to 1 when N appears in the ccl-valids argument, either
         as an integer or inside a (FROM . TO) range; all values must
         lie in 0..255 and TO must not be below FROM.  */
9850       val = args[coding_arg_ccl_valids];
9851       valids = Fmake_string (make_number (256), make_number (0));
9852       for (tail = val; !NILP (tail); tail = Fcdr (tail))
9853         {
9854           int from, to;
9855
9856           val = Fcar (tail);
9857           if (INTEGERP (val))
9858             {
9859               from = to = XINT (val);
9860               if (from < 0 || from > 255)
9861                 args_out_of_range_3 (val, make_number (0), make_number (255));
9862             }
9863           else
9864             {
9865               CHECK_CONS (val);
9866               CHECK_NATNUM_CAR (val);
9867               CHECK_NATNUM_CDR (val);
9868               from = XINT (XCAR (val));
9869               if (from > 255)
9870                 args_out_of_range_3 (XCAR (val),
9871                                      make_number (0), make_number (255));
9872               to = XINT (XCDR (val));
9873               if (to < from || to > 255)
9874                 args_out_of_range_3 (XCDR (val),
9875                                      XCAR (val), make_number (255));
9876             }
9877           for (i = from; i <= to; i++)
9878             SSET (valids, i, 1);
9879         }
9880       ASET (attrs, coding_attr_ccl_valids, valids);
9881
9882       category = coding_category_ccl;
9883     }
9884   else if (EQ (coding_type, Qutf_16))
9885     {
9886       Lisp_Object bom, endian;
9887
9888       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
9889
9890       if (nargs < coding_arg_utf16_max)
9891         goto short_args;
9892
      /* BOM is nil, t, or a cons of two coding systems.  */
9893       bom = args[coding_arg_utf16_bom];
9894       if (! NILP (bom) && ! EQ (bom, Qt))
9895         {
9896           CHECK_CONS (bom);
9897           val = XCAR (bom);
9898           CHECK_CODING_SYSTEM (val);
9899           val = XCDR (bom);
9900           CHECK_CODING_SYSTEM (val);
9901         }
9902       ASET (attrs, coding_attr_utf_bom, bom);
9903
      /* ENDIAN is `big' or `little'; nil defaults to `big'.  */
9904       endian = args[coding_arg_utf16_endian];
9905       CHECK_SYMBOL (endian);
9906       if (NILP (endian))
9907         endian = Qbig;
9908       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
9909         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
9910       ASET (attrs, coding_attr_utf_16_endian, endian);
9911
      /* A cons BOM selects the auto category.  Otherwise a nil BOM
         selects one of the nosig categories and a t BOM one of the
         plain be/le categories, according to ENDIAN.  */
9912       category = (CONSP (bom)
9913                   ? coding_category_utf_16_auto
9914                   : NILP (bom)
9915                   ? (EQ (endian, Qbig)
9916                      ? coding_category_utf_16_be_nosig
9917                      : coding_category_utf_16_le_nosig)
9918                   : (EQ (endian, Qbig)
9919                      ? coding_category_utf_16_be
9920                      : coding_category_utf_16_le));
9921     }
9922   else if (EQ (coding_type, Qiso_2022))
9923     {
9924       Lisp_Object initial, reg_usage, request, flags;
      /* NOTE(review): this I shadows the function-level I.  After the
         loop below it is reused to hold the integer value of FLAGS.  */
9925       int i;
9926
9927       if (nargs < coding_arg_iso2022_max)
9928         goto short_args;
9929
      /* In the copied INITIAL vector, replace each of the first four
         elements by a charset ID, or by -1 when the element is
         nil.  */
9930       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
9931       CHECK_VECTOR (initial);
9932       for (i = 0; i < 4; i++)
9933         {
9934           val = Faref (initial, make_number (i));
9935           if (! NILP (val))
9936             {
9937               struct charset *charset;
9938
9939               CHECK_CHARSET_GET_CHARSET (val, charset);
9940               ASET (initial, i, make_number (CHARSET_ID (charset)));
9941               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
9942                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
9943             }
9944           else
9945             ASET (initial, i, make_number (-1));
9946         }
9947
      /* REG_USAGE must be a cons of two numbers.  */
9948       reg_usage = args[coding_arg_iso2022_reg_usage];
9949       CHECK_CONS (reg_usage);
9950       CHECK_NUMBER_CAR (reg_usage);
9951       CHECK_NUMBER_CDR (reg_usage);
9952
      /* Each element of REQUEST must be a cons (CHARSET . REG) with
         REG in 0..3; CHARSET is replaced by its ID.
         NOTE(review): Fcopy_sequence presumably copies only the list
         spine, in which case the XSETCAR below modifies conses shared
         with the argument -- confirm this is intended.  */
9953       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
9954       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
9955         {
9956           int id;
9957           Lisp_Object tmp;
9958
9959           val = Fcar (tail);
9960           CHECK_CONS (val);
9961           tmp = XCAR (val);
9962           CHECK_CHARSET_GET_ID (tmp, id);
9963           CHECK_NATNUM_CDR (val);
9964           if (XINT (XCDR (val)) >= 4)
9965             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
9966           XSETCAR (val, make_number (id));
9967         }
9968
      /* I keeps the flags exactly as passed in.  The stored FLAGS
         additionally get CODING_ISO_FLAG_FULL_SUPPORT when the
         charset-list argument was the symbol iso-2022.  */
9969       flags = args[coding_arg_iso2022_flags];
9970       CHECK_NATNUM (flags);
9971       i = XINT (flags);
9972       if (EQ (args[coding_arg_charset_list], Qiso_2022))
9973         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
9974
9975       ASET (attrs, coding_attr_iso_initial, initial);
9976       ASET (attrs, coding_attr_iso_usage, reg_usage);
9977       ASET (attrs, coding_attr_iso_request, request);
9978       ASET (attrs, coding_attr_iso_flags, flags);
9979       setup_iso_safe_charsets (attrs);
9980
      /* Choose the category from the seven-bit flag, the shift flags,
         whether the full iso-2022 charset list was requested, and the
         charset stored in element 1 of INITIAL.  */
9981       if (i & CODING_ISO_FLAG_SEVEN_BITS)
9982         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
9983                           | CODING_ISO_FLAG_SINGLE_SHIFT))
9984                     ? coding_category_iso_7_else
9985                     : EQ (args[coding_arg_charset_list], Qiso_2022)
9986                     ? coding_category_iso_7
9987                     : coding_category_iso_7_tight);
9988       else
9989         {
9990           int id = XINT (AREF (initial, 1));
9991
9992           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
9993                        || EQ (args[coding_arg_charset_list], Qiso_2022)
9994                        || id < 0)
9995                       ? coding_category_iso_8_else
9996                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
9997                       ? coding_category_iso_8_1
9998                       : coding_category_iso_8_2);
9999         }
      /* Only the iso-8-1 and iso-8-2 categories keep an
         ASCII-compatible attribute that was set above.  */
10000       if (category != coding_category_iso_8_1
10001           && category != coding_category_iso_8_2)
10002         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
10003     }
10004   else if (EQ (coding_type, Qemacs_mule))
10005     {
10006       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
10007         ASET (attrs, coding_attr_emacs_mule_full, Qt);
10008       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10009       category = coding_category_emacs_mule;
10010     }
10011   else if (EQ (coding_type, Qshift_jis))
10012     {
10013
10014       struct charset *charset;
10015
      /* The charset list must hold three or four charsets of
         dimension 1, 1, 2 and (optionally) 2, in that order.  The
         local CHARSET_LIST is advanced while checking; ATTRS already
         holds the head of the list.  */
10016       if (XINT (Flength (charset_list)) != 3
10017           && XINT (Flength (charset_list)) != 4)
10018         error ("There should be three or four charsets");
10019
10020       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10021       if (CHARSET_DIMENSION (charset) != 1)
10022         error ("Dimension of charset %s is not one",
10023                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10024       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10025         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10026
10027       charset_list = XCDR (charset_list);
10028       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10029       if (CHARSET_DIMENSION (charset) != 1)
10030         error ("Dimension of charset %s is not one",
10031                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10032
10033       charset_list = XCDR (charset_list);
10034       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10035       if (CHARSET_DIMENSION (charset) != 2)
10036         error ("Dimension of charset %s is not two",
10037                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10038
10039       charset_list = XCDR (charset_list);
10040       if (! NILP (charset_list))
10041         {
10042           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10043           if (CHARSET_DIMENSION (charset) != 2)
10044             error ("Dimension of charset %s is not two",
10045                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10046         }
10047
10048       category = coding_category_sjis;
10049       Vsjis_coding_system = name;
10050     }
10051   else if (EQ (coding_type, Qbig5))
10052     {
10053       struct charset *charset;
10054
      /* The charset list must hold exactly two charsets, of dimension
         1 and 2 in that order.  */
10055       if (XINT (Flength (charset_list)) != 2)
10056         error ("There should be just two charsets");
10057
10058       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10059       if (CHARSET_DIMENSION (charset) != 1)
10060         error ("Dimension of charset %s is not one",
10061                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10062       if (CHARSET_ASCII_COMPATIBLE_P (charset))
10063         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10064
10065       charset_list = XCDR (charset_list);
10066       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
10067       if (CHARSET_DIMENSION (charset) != 2)
10068         error ("Dimension of charset %s is not two",
10069                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
10070
10071       category = coding_category_big5;
10072       Vbig5_coding_system = name;
10073     }
10074   else if (EQ (coding_type, Qraw_text))
10075     {
10076       category = coding_category_raw_text;
10077       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10078     }
10079   else if (EQ (coding_type, Qutf_8))
10080     {
10081       Lisp_Object bom;
10082
10083       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
10084
10085       if (nargs < coding_arg_utf8_max)
10086         goto short_args;
10087
      /* BOM is nil, t, or a cons of two coding systems.  */
10088       bom = args[coding_arg_utf8_bom];
10089       if (! NILP (bom) && ! EQ (bom, Qt))
10090         {
10091           CHECK_CONS (bom);
10092           val = XCAR (bom);
10093           CHECK_CODING_SYSTEM (val);
10094           val = XCDR (bom);
10095           CHECK_CODING_SYSTEM (val);
10096         }
10097       ASET (attrs, coding_attr_utf_bom, bom);
10098
10099       category = (CONSP (bom) ? coding_category_utf_8_auto
10100                   : NILP (bom) ? coding_category_utf_8_nosig
10101                   : coding_category_utf_8_sig);
10102     }
10103   else if (EQ (coding_type, Qundecided))
10104     category = coding_category_undecided;
10105   else
10106     error ("Invalid coding system type: %s",
10107            SDATA (SYMBOL_NAME (coding_type)));
10108
  /* Record the category in ATTRS, and push :category and
     :ascii-compatible-p entries onto the front of the plist.  */
10109   CODING_ATTR_CATEGORY (attrs) = make_number (category);
10110   CODING_ATTR_PLIST (attrs)
10111     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
10112                                 CODING_ATTR_PLIST (attrs)));
10113   CODING_ATTR_PLIST (attrs)
10114     = Fcons (QCascii_compatible_p,
10115              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
10116                     CODING_ATTR_PLIST (attrs)));
10117
  /* The eol-type argument must be nil, unix, dos or mac.  */
10118   eol_type = args[coding_arg_eol_type];
10119   if (! NILP (eol_type)
10120       && ! EQ (eol_type, Qunix)
10121       && ! EQ (eol_type, Qdos)
10122       && ! EQ (eol_type, Qmac))
10123     error ("Invalid eol-type");
10124
10125   aliases = Fcons (name, Qnil);
10126
10127   if (NILP (eol_type))
10128     {
      /* No fixed eol-type: EOL_TYPE becomes the vector of three
         subsidiary names, and each subsidiary is registered with a
         spec that shares ATTRS but has its own alias list and a fixed
         eol-type (unix, dos, mac for index 0, 1, 2).  */
10129       eol_type = make_subsidiaries (name);
10130       for (i = 0; i < 3; i++)
10131         {
10132           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
10133
10134           this_name = AREF (eol_type, i);
10135           this_aliases = Fcons (this_name, Qnil);
10136           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
          /* All three slots start as ATTRS; slots 1 and 2 are
             overwritten right below.  */
10137           this_spec = Fmake_vector (make_number (3), attrs);
10138           ASET (this_spec, 1, this_aliases);
10139           ASET (this_spec, 2, this_eol_type);
10140           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
10141           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
10142           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
10143           if (NILP (val))
10144             Vcoding_system_alist
10145               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
10146                        Vcoding_system_alist);
10147         }
10148     }
10149
  /* Register the coding system itself.  */
10150   spec_vec = Fmake_vector (make_number (3), attrs);
10151   ASET (spec_vec, 1, aliases);
10152   ASET (spec_vec, 2, eol_type);
10153
10154   Fputhash (name, spec_vec, Vcoding_system_hash_table);
10155   Vcoding_system_list = Fcons (name, Vcoding_system_list);
10156   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
10157   if (NILP (val))
10158     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
10159                                   Vcoding_system_alist);
10160
  /* (Re)initialize the per-category coding structure when its id is
     negative, or when it already names this coding system.  */
10161   {
10162     int id = coding_categories[category].id;
10163
10164     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
10165       setup_coding_system (name, &coding_categories[category]);
10166   }
10167
10168   return Qnil;
10169
  /* Reached when fewer arguments were given than the coding type
     requires.  */
10170  short_args:
10171   return Fsignal (Qwrong_number_of_arguments,
10172                   Fcons (intern ("define-coding-system-internal"),
10173                          make_number (nargs)));
10174 }
10175
10176
10177 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
10178        3, 3, 0,
10179        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
10180   (coding_system, prop, val)
10181      Lisp_Object coding_system, prop, val;
10182 {
10183   Lisp_Object spec, attrs;
10184
10185   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10186   attrs = AREF (spec, 0);
10187   if (EQ (prop, QCmnemonic))
10188     {
10189       if (! STRINGP (val))
10190         CHECK_CHARACTER (val);
10191       CODING_ATTR_MNEMONIC (attrs) = val;
10192     }
10193   else if (EQ (prop, QCdefault_char))
10194     {
10195       if (NILP (val))
10196         val = make_number (' ');
10197       else
10198         CHECK_CHARACTER (val);
10199       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
10200     }
10201   else if (EQ (prop, QCdecode_translation_table))
10202     {
10203       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10204         CHECK_SYMBOL (val);
10205       CODING_ATTR_DECODE_TBL (attrs) = val;
10206     }
10207   else if (EQ (prop, QCencode_translation_table))
10208     {
10209       if (! CHAR_TABLE_P (val) && ! CONSP (val))
10210         CHECK_SYMBOL (val);
10211       CODING_ATTR_ENCODE_TBL (attrs) = val;
10212     }
10213   else if (EQ (prop, QCpost_read_conversion))
10214     {
10215       CHECK_SYMBOL (val);
10216       CODING_ATTR_POST_READ (attrs) = val;
10217     }
10218   else if (EQ (prop, QCpre_write_conversion))
10219     {
10220       CHECK_SYMBOL (val);
10221       CODING_ATTR_PRE_WRITE (attrs) = val;
10222     }
10223   else if (EQ (prop, QCascii_compatible_p))
10224     {
10225       CODING_ATTR_ASCII_COMPAT (attrs) = val;
10226     }
10227
10228   CODING_ATTR_PLIST (attrs)
10229     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
10230   return val;
10231 }
10232
10233
10234 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
10235        Sdefine_coding_system_alias, 2, 2, 0,
10236        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
10237      (alias, coding_system)
10238      Lisp_Object alias, coding_system;
10239 {
10240   Lisp_Object spec, aliases, eol_type, val;
10241
10242   CHECK_SYMBOL (alias);
10243   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10244   aliases = AREF (spec, 1);
10245   /* ALIASES should be a list of length more than zero, and the first
10246      element is a base coding system.  Append ALIAS at the tail of the
10247      list.  */
10248   while (!NILP (XCDR (aliases)))
10249     aliases = XCDR (aliases);
10250   XSETCDR (aliases, Fcons (alias, Qnil));
10251
10252   eol_type = AREF (spec, 2);
10253   if (VECTORP (eol_type))
10254     {
10255       Lisp_Object subsidiaries;
10256       int i;
10257
10258       subsidiaries = make_subsidiaries (alias);
10259       for (i = 0; i < 3; i++)
10260         Fdefine_coding_system_alias (AREF (subsidiaries, i),
10261                                      AREF (eol_type, i));
10262     }
10263
10264   Fputhash (alias, spec, Vcoding_system_hash_table);
10265   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
10266   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
10267   if (NILP (val))
10268     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
10269                                   Vcoding_system_alist);
10270
10271   return Qnil;
10272 }
10273
10274 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
10275        1, 1, 0,
10276        doc: /* Return the base of CODING-SYSTEM.
10277 Any alias or subsidiary coding system is not a base coding system.  */)
10278   (coding_system)
10279      Lisp_Object coding_system;
10280 {
10281   Lisp_Object spec, attrs;
10282
10283   if (NILP (coding_system))
10284     return (Qno_conversion);
10285   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10286   attrs = AREF (spec, 0);
10287   return CODING_ATTR_BASE_NAME (attrs);
10288 }
10289
10290 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
10291        1, 1, 0,
10292        doc: "Return the property list of CODING-SYSTEM.")
10293      (coding_system)
10294      Lisp_Object coding_system;
10295 {
10296   Lisp_Object spec, attrs;
10297
10298   if (NILP (coding_system))
10299     coding_system = Qno_conversion;
10300   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10301   attrs = AREF (spec, 0);
10302   return CODING_ATTR_PLIST (attrs);
10303 }
10304
10305
10306 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
10307        1, 1, 0,
10308        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
10309      (coding_system)
10310      Lisp_Object coding_system;
10311 {
10312   Lisp_Object spec;
10313
10314   if (NILP (coding_system))
10315     coding_system = Qno_conversion;
10316   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
10317   return AREF (spec, 1);
10318 }
10319
10320 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
10321        Scoding_system_eol_type, 1, 1, 0,
10322        doc: /* Return eol-type of CODING-SYSTEM.
10323 An eol-type is an integer 0, 1, 2, or a vector of coding systems.
10324
10325 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
10326 and CR respectively.
10327
10328 A vector value indicates that a format of end-of-line should be
10329 detected automatically.  Nth element of the vector is the subsidiary
10330 coding system whose eol-type is N.  */)
10331      (coding_system)
10332      Lisp_Object coding_system;
10333 {
10334   Lisp_Object spec, eol_type;
10335   int n;
10336
10337   if (NILP (coding_system))
10338     coding_system = Qno_conversion;
10339   if (! CODING_SYSTEM_P (coding_system))
10340     return Qnil;
10341   spec = CODING_SYSTEM_SPEC (coding_system);
10342   eol_type = AREF (spec, 2);
10343   if (VECTORP (eol_type))
10344     return Fcopy_sequence (eol_type);
10345   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
10346   return make_number (n);
10347 }
10348
10349 #endif /* emacs */
10350
10351 \f
10352 /*** 12. Postamble ***/
10353
10354 void
10355 init_coding_once ()
10356 {
10357   int i;
10358
10359   for (i = 0; i < coding_category_max; i++)
10360     {
10361       coding_categories[i].id = -1;
10362       coding_priorities[i] = i;
10363     }
10364
10365   /* ISO2022 specific initialize routine.  */
10366   for (i = 0; i < 0x20; i++)
10367     iso_code_class[i] = ISO_control_0;
10368   for (i = 0x21; i < 0x7F; i++)
10369     iso_code_class[i] = ISO_graphic_plane_0;
10370   for (i = 0x80; i < 0xA0; i++)
10371     iso_code_class[i] = ISO_control_1;
10372   for (i = 0xA1; i < 0xFF; i++)
10373     iso_code_class[i] = ISO_graphic_plane_1;
10374   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
10375   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
10376   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
10377   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
10378   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
10379   iso_code_class[ISO_CODE_ESC] = ISO_escape;
10380   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
10381   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
10382   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
10383
10384   for (i = 0; i < 256; i++)
10385     {
10386       emacs_mule_bytes[i] = 1;
10387     }
10388   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
10389   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
10390   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
10391   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
10392 }
10393
10394 #ifdef emacs
10395
10396 void
10397 syms_of_coding ()
10398 {
10399   staticpro (&Vcoding_system_hash_table);
10400   {
10401     Lisp_Object args[2];
10402     args[0] = QCtest;
10403     args[1] = Qeq;
10404     Vcoding_system_hash_table = Fmake_hash_table (2, args);
10405   }
10406
10407   staticpro (&Vsjis_coding_system);
10408   Vsjis_coding_system = Qnil;
10409
10410   staticpro (&Vbig5_coding_system);
10411   Vbig5_coding_system = Qnil;
10412
10413   staticpro (&Vcode_conversion_reused_workbuf);
10414   Vcode_conversion_reused_workbuf = Qnil;
10415
10416   staticpro (&Vcode_conversion_workbuf_name);
10417   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
10418
10419   reused_workbuf_in_use = 0;
10420
10421   DEFSYM (Qcharset, "charset");
10422   DEFSYM (Qtarget_idx, "target-idx");
10423   DEFSYM (Qcoding_system_history, "coding-system-history");
10424   Fset (Qcoding_system_history, Qnil);
10425
10426   /* Target FILENAME is the first argument.  */
10427   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
10428   /* Target FILENAME is the third argument.  */
10429   Fput (Qwrite_region, Qtarget_idx, make_number (2));
10430
10431   DEFSYM (Qcall_process, "call-process");
10432   /* Target PROGRAM is the first argument.  */
10433   Fput (Qcall_process, Qtarget_idx, make_number (0));
10434
10435   DEFSYM (Qcall_process_region, "call-process-region");
10436   /* Target PROGRAM is the third argument.  */
10437   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
10438
10439   DEFSYM (Qstart_process, "start-process");
10440   /* Target PROGRAM is the third argument.  */
10441   Fput (Qstart_process, Qtarget_idx, make_number (2));
10442
10443   DEFSYM (Qopen_network_stream, "open-network-stream");
10444   /* Target SERVICE is the fourth argument.  */
10445   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
10446
10447   DEFSYM (Qcoding_system, "coding-system");
10448   DEFSYM (Qcoding_aliases, "coding-aliases");
10449
10450   DEFSYM (Qeol_type, "eol-type");
10451   DEFSYM (Qunix, "unix");
10452   DEFSYM (Qdos, "dos");
10453
10454   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
10455   DEFSYM (Qpost_read_conversion, "post-read-conversion");
10456   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
10457   DEFSYM (Qdefault_char, "default-char");
10458   DEFSYM (Qundecided, "undecided");
10459   DEFSYM (Qno_conversion, "no-conversion");
10460   DEFSYM (Qraw_text, "raw-text");
10461
10462   DEFSYM (Qiso_2022, "iso-2022");
10463
10464   DEFSYM (Qutf_8, "utf-8");
10465   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
10466
10467   DEFSYM (Qutf_16, "utf-16");
10468   DEFSYM (Qbig, "big");
10469   DEFSYM (Qlittle, "little");
10470
10471   DEFSYM (Qshift_jis, "shift-jis");
10472   DEFSYM (Qbig5, "big5");
10473
10474   DEFSYM (Qcoding_system_p, "coding-system-p");
10475
10476   DEFSYM (Qcoding_system_error, "coding-system-error");
10477   Fput (Qcoding_system_error, Qerror_conditions,
10478         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
10479   Fput (Qcoding_system_error, Qerror_message,
10480         build_string ("Invalid coding system"));
10481
10482   /* Intern this now in case it isn't already done.
10483      Setting this variable twice is harmless.
10484      But don't staticpro it here--that is done in alloc.c.  */
10485   Qchar_table_extra_slots = intern ("char-table-extra-slots");
10486
10487   DEFSYM (Qtranslation_table, "translation-table");
10488   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
10489   DEFSYM (Qtranslation_table_id, "translation-table-id");
10490   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
10491   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
10492
10493   DEFSYM (Qvalid_codes, "valid-codes");
10494
10495   DEFSYM (Qemacs_mule, "emacs-mule");
10496
10497   DEFSYM (QCcategory, ":category");
10498   DEFSYM (QCmnemonic, ":mnemonic");
10499   DEFSYM (QCdefault_char, ":default-char");
10500   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
10501   DEFSYM (QCencode_translation_table, ":encode-translation-table");
10502   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
10503   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
10504   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
10505
10506   Vcoding_category_table
10507     = Fmake_vector (make_number (coding_category_max), Qnil);
10508   staticpro (&Vcoding_category_table);
10509   /* Followings are target of code detection.  */
10510   ASET (Vcoding_category_table, coding_category_iso_7,
10511         intern ("coding-category-iso-7"));
10512   ASET (Vcoding_category_table, coding_category_iso_7_tight,
10513         intern ("coding-category-iso-7-tight"));
10514   ASET (Vcoding_category_table, coding_category_iso_8_1,
10515         intern ("coding-category-iso-8-1"));
10516   ASET (Vcoding_category_table, coding_category_iso_8_2,
10517         intern ("coding-category-iso-8-2"));
10518   ASET (Vcoding_category_table, coding_category_iso_7_else,
10519         intern ("coding-category-iso-7-else"));
10520   ASET (Vcoding_category_table, coding_category_iso_8_else,
10521         intern ("coding-category-iso-8-else"));
10522   ASET (Vcoding_category_table, coding_category_utf_8_auto,
10523         intern ("coding-category-utf-8-auto"));
10524   ASET (Vcoding_category_table, coding_category_utf_8_nosig,
10525         intern ("coding-category-utf-8"));
10526   ASET (Vcoding_category_table, coding_category_utf_8_sig,
10527         intern ("coding-category-utf-8-sig"));
10528   ASET (Vcoding_category_table, coding_category_utf_16_be,
10529         intern ("coding-category-utf-16-be"));
10530   ASET (Vcoding_category_table, coding_category_utf_16_auto,
10531         intern ("coding-category-utf-16-auto"));
10532   ASET (Vcoding_category_table, coding_category_utf_16_le,
10533         intern ("coding-category-utf-16-le"));
10534   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
10535         intern ("coding-category-utf-16-be-nosig"));
10536   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
10537         intern ("coding-category-utf-16-le-nosig"));
10538   ASET (Vcoding_category_table, coding_category_charset,
10539         intern ("coding-category-charset"));
10540   ASET (Vcoding_category_table, coding_category_sjis,
10541         intern ("coding-category-sjis"));
10542   ASET (Vcoding_category_table, coding_category_big5,
10543         intern ("coding-category-big5"));
10544   ASET (Vcoding_category_table, coding_category_ccl,
10545         intern ("coding-category-ccl"));
10546   ASET (Vcoding_category_table, coding_category_emacs_mule,
10547         intern ("coding-category-emacs-mule"));
10548   /* Followings are NOT target of code detection.  */
10549   ASET (Vcoding_category_table, coding_category_raw_text,
10550         intern ("coding-category-raw-text"));
10551   ASET (Vcoding_category_table, coding_category_undecided,
10552         intern ("coding-category-undecided"));
10553
10554   DEFSYM (Qinsufficient_source, "insufficient-source");
10555   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
10556   DEFSYM (Qinvalid_source, "invalid-source");
10557   DEFSYM (Qinterrupted, "interrupted");
10558   DEFSYM (Qinsufficient_memory, "insufficient-memory");
10559   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
10560
10561   defsubr (&Scoding_system_p);
10562   defsubr (&Sread_coding_system);
10563   defsubr (&Sread_non_nil_coding_system);
10564   defsubr (&Scheck_coding_system);
10565   defsubr (&Sdetect_coding_region);
10566   defsubr (&Sdetect_coding_string);
10567   defsubr (&Sfind_coding_systems_region_internal);
10568   defsubr (&Sunencodable_char_position);
10569   defsubr (&Scheck_coding_systems_region);
10570   defsubr (&Sdecode_coding_region);
10571   defsubr (&Sencode_coding_region);
10572   defsubr (&Sdecode_coding_string);
10573   defsubr (&Sencode_coding_string);
10574   defsubr (&Sdecode_sjis_char);
10575   defsubr (&Sencode_sjis_char);
10576   defsubr (&Sdecode_big5_char);
10577   defsubr (&Sencode_big5_char);
10578   defsubr (&Sset_terminal_coding_system_internal);
10579   defsubr (&Sset_safe_terminal_coding_system_internal);
10580   defsubr (&Sterminal_coding_system);
10581   defsubr (&Sset_keyboard_coding_system_internal);
10582   defsubr (&Skeyboard_coding_system);
10583   defsubr (&Sfind_operation_coding_system);
10584   defsubr (&Sset_coding_system_priority);
10585   defsubr (&Sdefine_coding_system_internal);
10586   defsubr (&Sdefine_coding_system_alias);
10587   defsubr (&Scoding_system_put);
10588   defsubr (&Scoding_system_base);
10589   defsubr (&Scoding_system_plist);
10590   defsubr (&Scoding_system_aliases);
10591   defsubr (&Scoding_system_eol_type);
10592   defsubr (&Scoding_system_priority_list);
10593
10594   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
10595                doc: /* List of coding systems.
10596
10597 Do not alter the value of this variable manually.  This variable should be
10598 updated by the functions `define-coding-system' and
10599 `define-coding-system-alias'.  */);
10600   Vcoding_system_list = Qnil;
10601
10602   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
10603                doc: /* Alist of coding system names.
10604 Each element is one element list of coding system name.
10605 This variable is given to `completing-read' as COLLECTION argument.
10606
10607 Do not alter the value of this variable manually.  This variable should be
10608 updated by the functions `make-coding-system' and
10609 `define-coding-system-alias'.  */);
10610   Vcoding_system_alist = Qnil;
10611
10612   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
10613                doc: /* List of coding-categories (symbols) ordered by priority.
10614
10615 On detecting a coding system, Emacs tries code detection algorithms
10616 associated with each coding-category one by one in this order.  When
10617 one algorithm agrees with a byte sequence of source text, the coding
10618 system bound to the corresponding coding-category is selected.
10619
10620 Don't modify this variable directly, but use `set-coding-priority'.  */);
10621   {
10622     int i;
10623
10624     Vcoding_category_list = Qnil;
10625     for (i = coding_category_max - 1; i >= 0; i--)
10626       Vcoding_category_list
10627         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
10628                  Vcoding_category_list);
10629   }
10630
10631   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
10632                doc: /* Specify the coding system for read operations.
10633 It is useful to bind this variable with `let', but do not set it globally.
10634 If the value is a coding system, it is used for decoding on read operation.
10635 If not, an appropriate element is used from one of the coding system alists.
10636 There are three such tables: `file-coding-system-alist',
10637 `process-coding-system-alist', and `network-coding-system-alist'.  */);
10638   Vcoding_system_for_read = Qnil;
10639
10640   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
10641                doc: /* Specify the coding system for write operations.
10642 Programs bind this variable with `let', but you should not set it globally.
10643 If the value is a coding system, it is used for encoding of output,
10644 when writing it to a file and when sending it to a file or subprocess.
10645
10646 If this does not specify a coding system, an appropriate element
10647 is used from one of the coding system alists.
10648 There are three such tables: `file-coding-system-alist',
10649 `process-coding-system-alist', and `network-coding-system-alist'.
10650 For output to files, if the above procedure does not specify a coding system,
10651 the value of `buffer-file-coding-system' is used.  */);
10652   Vcoding_system_for_write = Qnil;
10653
10654   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
10655                doc: /*
10656 Coding system used in the latest file or process I/O.  */);
10657   Vlast_coding_system_used = Qnil;
10658
10659   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
10660                doc: /*
10661 Error status of the last code conversion.
10662
10663 When an error was detected in the last code conversion, this variable
10664 is set to one of the following symbols.
10665   `insufficient-source'
10666   `inconsistent-eol'
10667   `invalid-source'
10668   `interrupted'
10669   `insufficient-memory'
10670 When no error was detected, the value doesn't change.  So, to check
10671 the error status of a code conversion by this variable, you must
10672 explicitly set this variable to nil before performing code
10673 conversion.  */);
10674   Vlast_code_conversion_error = Qnil;
10675
10676   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
10677                doc: /*
10678 *Non-nil means always inhibit code conversion of end-of-line format.
10679 See info node `Coding Systems' and info node `Text and Binary' concerning
10680 such conversion.  */);
10681   inhibit_eol_conversion = 0;
10682
10683   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
10684                doc: /*
10685 Non-nil means process buffer inherits coding system of process output.
10686 Bind it to t if the process output is to be treated as if it were a file
10687 read from some filesystem.  */);
10688   inherit_process_coding_system = 0;
10689
10690   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
10691                doc: /*
10692 Alist to decide a coding system to use for a file I/O operation.
10693 The format is ((PATTERN . VAL) ...),
10694 where PATTERN is a regular expression matching a file name,
10695 VAL is a coding system, a cons of coding systems, or a function symbol.
10696 If VAL is a coding system, it is used for both decoding and encoding
10697 the file contents.
10698 If VAL is a cons of coding systems, the car part is used for decoding,
10699 and the cdr part is used for encoding.
10700 If VAL is a function symbol, the function must return a coding system
10701 or a cons of coding systems which are used as above.  The function is
10702 called with an argument that is a list of the arguments with which
10703 `find-operation-coding-system' was called.  If the function can't decide
10704 a coding system, it can return `undecided' so that the normal
10705 code-detection is performed.
10706
10707 See also the function `find-operation-coding-system'
10708 and the variable `auto-coding-alist'.  */);
10709   Vfile_coding_system_alist = Qnil;
10710
10711   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
10712                doc: /*
10713 Alist to decide a coding system to use for a process I/O operation.
10714 The format is ((PATTERN . VAL) ...),
10715 where PATTERN is a regular expression matching a program name,
10716 VAL is a coding system, a cons of coding systems, or a function symbol.
10717 If VAL is a coding system, it is used for both decoding what received
10718 from the program and encoding what sent to the program.
10719 If VAL is a cons of coding systems, the car part is used for decoding,
10720 and the cdr part is used for encoding.
10721 If VAL is a function symbol, the function must return a coding system
10722 or a cons of coding systems which are used as above.
10723
10724 See also the function `find-operation-coding-system'.  */);
10725   Vprocess_coding_system_alist = Qnil;
10726
10727   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
10728                doc: /*
10729 Alist to decide a coding system to use for a network I/O operation.
10730 The format is ((PATTERN . VAL) ...),
10731 where PATTERN is a regular expression matching a network service name
10732 or is a port number to connect to,
10733 VAL is a coding system, a cons of coding systems, or a function symbol.
10734 If VAL is a coding system, it is used for both decoding what received
10735 from the network stream and encoding what sent to the network stream.
10736 If VAL is a cons of coding systems, the car part is used for decoding,
10737 and the cdr part is used for encoding.
10738 If VAL is a function symbol, the function must return a coding system
10739 or a cons of coding systems which are used as above.
10740
10741 See also the function `find-operation-coding-system'.  */);
10742   Vnetwork_coding_system_alist = Qnil;
10743
10744   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
10745                doc: /* Coding system to use with system messages.
10746 Also used for decoding keyboard input on X Window system.  */);
10747   Vlocale_coding_system = Qnil;
10748
10749   /* The eol mnemonics are reset in startup.el system-dependently.  */
10750   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
10751                doc: /*
10752 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
10753   eol_mnemonic_unix = build_string (":");
10754
10755   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
10756                doc: /*
10757 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
10758   eol_mnemonic_dos = build_string ("\\");
10759
10760   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
10761                doc: /*
10762 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
10763   eol_mnemonic_mac = build_string ("/");
10764
10765   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
10766                doc: /*
10767 *String displayed in mode line when end-of-line format is not yet determined.  */);
10768   eol_mnemonic_undecided = build_string (":");
10769
10770   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
10771                doc: /*
10772 *Non-nil enables character translation while encoding and decoding.  */);
10773   Venable_character_translation = Qt;
10774
10775   DEFVAR_LISP ("standard-translation-table-for-decode",
10776                &Vstandard_translation_table_for_decode,
10777                doc: /* Table for translating characters while decoding.  */);
10778   Vstandard_translation_table_for_decode = Qnil;
10779
10780   DEFVAR_LISP ("standard-translation-table-for-encode",
10781                &Vstandard_translation_table_for_encode,
10782                doc: /* Table for translating characters while encoding.  */);
10783   Vstandard_translation_table_for_encode = Qnil;
10784
10785   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
10786                doc: /* Alist of charsets vs revision numbers.
10787 While encoding, if a charset (car part of an element) is found,
10788 designate it with the escape sequence identifying revision (cdr part
10789 of the element).  */);
10790   Vcharset_revision_table = Qnil;
10791
10792   DEFVAR_LISP ("default-process-coding-system",
10793                &Vdefault_process_coding_system,
10794                doc: /* Cons of coding systems used for process I/O by default.
10795 The car part is used for decoding a process output,
10796 the cdr part is used for encoding a text to be sent to a process.  */);
10797   Vdefault_process_coding_system = Qnil;
10798
10799   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
10800                doc: /*
10801 Table of extra Latin codes in the range 128..159 (inclusive).
10802 This is a vector of length 256.
10803 If Nth element is non-nil, the existence of code N in a file
10804 \(or output of subprocess) doesn't prevent it to be detected as
10805 a coding system of ISO 2022 variant which has a flag
10806 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
10807 or reading output of a subprocess.
10808 Only 128th through 159th elements have a meaning.  */);
10809   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
10810
10811   DEFVAR_LISP ("select-safe-coding-system-function",
10812                &Vselect_safe_coding_system_function,
10813                doc: /*
10814 Function to call to select safe coding system for encoding a text.
10815
10816 If set, this function is called to force a user to select a proper
10817 coding system which can encode the text in the case that a default
10818 coding system used in each operation can't encode the text.  The
10819 function should take care that the buffer is not modified while
10820 the coding system is being selected.
10821
10822 The default value is `select-safe-coding-system' (which see).  */);
10823   Vselect_safe_coding_system_function = Qnil;
10824
10825   DEFVAR_BOOL ("coding-system-require-warning",
10826                &coding_system_require_warning,
10827                doc: /* Internal use only.
10828 If non-nil, on writing a file, `select-safe-coding-system-function' is
10829 called even if `coding-system-for-write' is non-nil.  The command
10830 `universal-coding-system-argument' binds this variable to t temporarily.  */);
10831   coding_system_require_warning = 0;
10832
10833
10834   DEFVAR_BOOL ("inhibit-iso-escape-detection",
10835                &inhibit_iso_escape_detection,
10836                doc: /*
10837 If non-nil, Emacs ignores ISO-2022 escape sequences during code detection.
10838
10839 When Emacs reads text, it tries to detect how the text is encoded.
10840 This code detection is sensitive to escape sequences.  If Emacs sees
10841 a valid ISO-2022 escape sequence, it assumes the text is encoded in one
10842 of the ISO2022 encodings, and decodes text by the corresponding coding
10843 system (e.g. `iso-2022-7bit').
10844
10845 However, there may be a case that you want to read escape sequences in
10846 a file as is.  In such a case, you can set this variable to non-nil.
10847 Then the code detection will ignore any escape sequences, and no text is
10848 detected as encoded in some ISO-2022 encoding.  The result is that all
10849 escape sequences become visible in a buffer.
10850
10851 The default value is nil, and it is strongly recommended not to change
10852 it.  That is because many Emacs Lisp source files that contain
10853 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
10854 in Emacs's distribution, and they won't be decoded correctly on
10855 reading if you suppress escape sequence detection.
10856
10857 The other way to read escape sequences in a file without decoding is
10858 to explicitly specify some coding system that doesn't use ISO-2022
10859 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
10860   inhibit_iso_escape_detection = 0;
10861
10862   DEFVAR_BOOL ("inhibit-null-byte-detection",
10863                &inhibit_null_byte_detection,
10864                doc: /* If non-nil, Emacs ignores null bytes on code detection.
10865 By default, Emacs treats it as binary data, and does not attempt to
10866 decode it.  The effect is as if you specified `no-conversion' for
10867 reading that text.
10868
10869 Set this to non-nil when a regular text happens to include null bytes.
10870 Examples are Index nodes of Info files and null-byte delimited output
10871 from GNU Find and GNU Grep.  Emacs will then ignore the null bytes and
10872 decode text as usual.  */);
10873   inhibit_null_byte_detection = 0;
10874
10875   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
10876                doc: /* Char table for translating self-inserting characters.
10877 This is applied to the result of input methods, not their input.
10878 See also `keyboard-translate-table'.
10879
10880 Use of this variable for character code unification was rendered
10881 obsolete in Emacs 23.1 and later, since Unicode is now the basis of
10882 internal character representation.  */);
10883     Vtranslation_table_for_input = Qnil;
10884
10885   {
10886     Lisp_Object args[coding_arg_max];
10887     Lisp_Object plist[16];
10888     int i;
10889
10890     for (i = 0; i < coding_arg_max; i++)
10891       args[i] = Qnil;
10892
10893     plist[0] = intern (":name");
10894     plist[1] = args[coding_arg_name] = Qno_conversion;
10895     plist[2] = intern (":mnemonic");
10896     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
10897     plist[4] = intern (":coding-type");
10898     plist[5] = args[coding_arg_coding_type] = Qraw_text;
10899     plist[6] = intern (":ascii-compatible-p");
10900     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
10901     plist[8] = intern (":default-char");
10902     plist[9] = args[coding_arg_default_char] = make_number (0);
10903     plist[10] = intern (":for-unibyte");
10904     plist[11] = args[coding_arg_for_unibyte] = Qt;
10905     plist[12] = intern (":docstring");
10906     plist[13] = build_string ("Do no conversion.\n\
10907 \n\
10908 When you visit a file with this coding, the file is read into a\n\
10909 unibyte buffer as is, thus each byte of a file is treated as a\n\
10910 character.");
10911     plist[14] = intern (":eol-type");
10912     plist[15] = args[coding_arg_eol_type] = Qunix;
10913     args[coding_arg_plist] = Flist (16, plist);
10914     Fdefine_coding_system_internal (coding_arg_max, args);
10915
10916     plist[1] = args[coding_arg_name] = Qundecided;
10917     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
10918     plist[5] = args[coding_arg_coding_type] = Qundecided;
10919     /* This is already set.
10920        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
10921     plist[8] = intern (":charset-list");
10922     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
10923     plist[11] = args[coding_arg_for_unibyte] = Qnil;
10924     plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
10925     plist[15] = args[coding_arg_eol_type] = Qnil;
10926     args[coding_arg_plist] = Flist (16, plist);
10927     Fdefine_coding_system_internal (coding_arg_max, args);
10928   }
10929
10930   setup_coding_system (Qno_conversion, &safe_terminal_coding);
10931
10932   {
10933     int i;
10934
10935     for (i = 0; i < coding_category_max; i++)
10936       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
10937   }
10938 #if defined (MSDOS) || defined (WINDOWSNT)
10939   system_eol_type = Qdos;
10940 #else
10941   system_eol_type = Qunix;
10942 #endif
10943   staticpro (&system_eol_type);
10944 }
10945
10946 char *
10947 emacs_strerror (error_number)
10948      int error_number;
10949 {
10950   char *str;
10951
10952   synchronize_system_messages_locale ();
10953   str = strerror (error_number);
10954
10955   if (! NILP (Vlocale_coding_system))
10956     {
10957       Lisp_Object dec = code_convert_string_norecord (build_string (str),
10958                                                       Vlocale_coding_system,
10959                                                       0);
10960       str = (char *) SDATA (dec);
10961     }
10962
10963   return str;
10964 }
10965
10966 #endif /* emacs */
10967
10968 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
10969    (do not change this comment) */