src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software; you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation; either version 2, or (at your option)
  16 any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs; see the file COPYING.  If not, write to
  25 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  26 Boston, MA 02110-1301, USA.  */
  27
  28 /*** TABLE OF CONTENTS ***
  29
  30   0. General comments
  31   1. Preamble
  32   2. Emacs' internal format (emacs-utf-8) handlers
  33   3. UTF-8 handlers
  34   4. UTF-16 handlers
  35   5. Charset-base coding systems handlers
  36   6. emacs-mule (old Emacs' internal format) handlers
  37   7. ISO2022 handlers
  38   8. Shift-JIS and BIG5 handlers
  39   9. CCL handlers
  40   10. C library functions
  41   11. Emacs Lisp library functions
  42   12. Postamble
  43
  44 */
  45
  46 /*** 0. General comments ***
  47
  48
  49 CODING SYSTEM
  50
  51   A coding system is an object for an encoding mechanism that contains
  52   information about how to convert byte sequences to character
  53   sequences and vice versa.  When we say "decode", it means converting
  54   a byte sequence of a specific coding system into a character
  55   sequence that is represented by Emacs' internal coding system
  56   `emacs-utf-8', and when we say "encode", it means converting a
  57   character sequence of emacs-utf-8 to a byte sequence of a specific
  58   coding system.
  59
  60   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  61   C level, a coding system is represented by a vector of attributes
  62   stored in the hash table Vcharset_hash_table.  The conversion from
  63   coding system symbol to attributes vector is done by looking up
  64   Vcharset_hash_table by the symbol.
  65
  66   Coding systems are classified into the following types depending on
  67   the encoding mechanism.  Here's a brief description of the types.
  68
  69   o UTF-8
  70
  71   o UTF-16
  72
  73   o Charset-base coding system
  74
  75   A coding system defined by one or more (coded) character sets.
  76   Decoding and encoding are done by a code converter defined for each
  77   character set.
  78
  79   o Old Emacs internal format (emacs-mule)
  80
  81   The coding system adopted by old versions of Emacs (20 and 21).
  82
  83   o ISO2022-base coding system
  84
  85   The most famous coding system for multiple character sets.  X's
  86   Compound Text, various EUCs (Extended Unix Code), and coding systems
  87   used in the Internet communication such as ISO-2022-JP are all
  88   variants of ISO2022.
  89
  90   o SJIS (or Shift-JIS or MS-Kanji-Code)
  91
  92   A coding system to encode character sets: ASCII, JISX0201, and
  93   JISX0208.  Widely used for PC's in Japan.  Details are described in
  94   section 8.
  95
  96   o BIG5
  97
  98   A coding system to encode character sets: ASCII and Big5.  Widely
  99   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
 100   described in section 8.  In this file, when we write "big5" (all
 101   lowercase), we mean the coding system, and when we write "Big5"
 102   (capitalized), we mean the character set.
 103
 104   o CCL
 105
 106   If a user wants to decode/encode text encoded in a coding system
 107   not listed above, he can supply a decoder and an encoder for it in
 108   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 109   program while decoding/encoding.
 110
 111   o Raw-text
 112
 113   A coding system for text containing raw eight-bit data.  Emacs
 114   treats each byte of source text as a character (except for
 115   end-of-line conversion).
 116
 117   o No-conversion
 118
 119   Like raw text, but don't do end-of-line conversion.
 120
 121
 122 END-OF-LINE FORMAT
 123
 124   How text end-of-line is encoded depends on operating system.  For
 125   instance, Unix's format is just one byte of LF (line-feed) code,
 126   whereas DOS's format is two-byte sequence of `carriage-return' and
 127   `line-feed' codes.  MacOS's format is usually one byte of
 128   `carriage-return'.
 129
 130   Since text character encoding and end-of-line encoding are
 131   independent, any coding system described above can take any format
 132   of end-of-line (except for no-conversion).
 133
 134 STRUCT CODING_SYSTEM
 135
 136   Before using a coding system for code conversion (i.e. decoding and
 137   encoding), we setup a structure of type `struct coding_system'.
 138   This structure keeps various information about a specific code
 139   conversion (e.g. the location of source and destination data).
 140
 141 */
 142
 143 /* COMMON MACROS */
 144
 145
 146 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 147
 148   These functions check if a byte sequence specified as a source in
 149   CODING conforms to the format of XXX, and update the members of
 150   DETECT_INFO.
 151
 152   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 153
 154   Below is the template of these functions.  */
 155
 156 #if 0
 157 static int
 158 detect_coding_XXX (coding, detect_info)
 159      struct coding_system *coding;
 160      struct coding_detection_info *detect_info;
 161 {
 162   const unsigned char *src = coding->source;
 163   const unsigned char *src_end = coding->source + coding->src_bytes;
 164   int multibytep = coding->src_multibyte;
 165   int consumed_chars = 0;
 166   int found = 0;
 167   ...;
 168
 169   while (1)
 170     {
 171       /* Get one byte from the source.  If the souce is exausted, jump
 172          to no_more_source:.  */
 173       ONE_MORE_BYTE (c);
 174
 175       if (! __C_conforms_to_XXX___ (c))
 176         break;
 177       if (! __C_strongly_suggests_XXX__ (c))
 178         found = CATEGORY_MASK_XXX;
 179     }
 180   /* The byte sequence is invalid for XXX.  */
 181   detect_info->rejected |= CATEGORY_MASK_XXX;
 182   return 0;
 183
 184  no_more_source:
 185   /* The source exausted successfully.  */
 186   detect_info->found |= found;
 187   return 1;
 188 }
 189 #endif
 190
 191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 192
 193   These functions decode a byte sequence specified as a source by
 194   CODING.  The resulting multibyte text goes to a place pointed to by
 195   CODING->charbuf, the length of which should not exceed
 196   CODING->charbuf_size;
 197
 198   These functions set the information of original and decoded texts in
 199   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 200   They also set CODING->result to one of CODING_RESULT_XXX indicating
 201   how the decoding is finished.
 202
 203   Below is the template of these functions.  */
 204
 205 #if 0
 206 static void
 207 decode_coding_XXXX (coding)
 208      struct coding_system *coding;
 209 {
 210   const unsigned char *src = coding->source + coding->consumed;
 211   const unsigned char *src_end = coding->source + coding->src_bytes;
 212   /* SRC_BASE remembers the start position in source in each loop.
 213      The loop will be exited when there's not enough source code, or
 214      when there's no room in CHARBUF for a decoded character.  */
 215   const unsigned char *src_base;
 216   /* A buffer to produce decoded characters.  */
 217   int *charbuf = coding->charbuf + coding->charbuf_used;
 218   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 219   int multibytep = coding->src_multibyte;
 220
 221   while (1)
 222     {
 223       src_base = src;
 224       if (charbuf < charbuf_end)
 225         /* No more room to produce a decoded character.  */
 226         break;
 227       ONE_MORE_BYTE (c);
 228       /* Decode it. */
 229     }
 230
 231  no_more_source:
 232   if (src_base < src_end
 233       && coding->mode & CODING_MODE_LAST_BLOCK)
 234     /* If the source ends by partial bytes to construct a character,
 235        treat them as eight-bit raw data.  */
 236     while (src_base < src_end && charbuf < charbuf_end)
 237       *charbuf++ = *src_base++;
 238   /* Remember how many bytes and characters we consumed.  If the
 239      source is multibyte, the bytes and chars are not identical.  */
 240   coding->consumed = coding->consumed_char = src_base - coding->source;
 241   /* Remember how many characters we produced.  */
 242   coding->charbuf_used = charbuf - coding->charbuf;
 243 }
 244 #endif
 245
 246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 247
 248   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 249   internal multibyte format by CODING.  The resulting byte sequence
 250   goes to a place pointed to by DESTINATION, the length of which
 251   should not exceed DST_BYTES.
 252
 253   These functions set the information of original and encoded texts in
 254   the members produced, produced_char, consumed, and consumed_char of
 255   the structure *CODING.  They also set the member result to one of
 256   CODING_RESULT_XXX indicating how the encoding finished.
 257
 258   DST_BYTES zero means that source area and destination area are
 259   overlapped, which means that we can produce a encoded text until it
 260   reaches at the head of not-yet-encoded source text.
 261
 262   Below is a template of these functions.  */
 263 #if 0
 264 static void
 265 encode_coding_XXX (coding)
 266      struct coding_system *coding;
 267 {
 268   int multibytep = coding->dst_multibyte;
 269   int *charbuf = coding->charbuf;
 270   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 271   unsigned char *dst = coding->destination + coding->produced;
 272   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 273   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 274   int produced_chars = 0;
 275
 276   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 277     {
 278       int c = *charbuf;
 279       /* Encode C into DST, and increment DST.  */
 280     }
 281  label_no_more_destination:
 282   /* How many chars and bytes we produced.  */
 283   coding->produced_char += produced_chars;
 284   coding->produced = dst - coding->destination;
 285 }
 286 #endif
 287
 288 \f
 289 /*** 1. Preamble ***/
 290
 291 #include <config.h>
 292 #include <stdio.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302
 303 Lisp_Object Vcoding_system_hash_table;
 304
 305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 306 Lisp_Object Qunix, Qdos;
 307 extern Lisp_Object Qmac;        /* frame.c */
 308 Lisp_Object Qbuffer_file_coding_system;
 309 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 310 Lisp_Object Qdefault_char;
 311 Lisp_Object Qno_conversion, Qundecided;
 312 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 313 Lisp_Object Qbig, Qlittle;
 314 Lisp_Object Qcoding_system_history;
 315 Lisp_Object Qvalid_codes;
 316 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
 317 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 318 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 319 Lisp_Object QCascii_compatible_p;
 320
 321 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 322 Lisp_Object Qcall_process, Qcall_process_region;
 323 Lisp_Object Qstart_process, Qopen_network_stream;
 324 Lisp_Object Qtarget_idx;
 325
 326 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 327 Lisp_Object Qinterrupted, Qinsufficient_memory;
 328
 329 /* If a symbol has this property, evaluate the value to define the
 330    symbol as a coding system.  */
 331 static Lisp_Object Qcoding_system_define_form;
 332
 333 int coding_system_require_warning;
 334
 335 Lisp_Object Vselect_safe_coding_system_function;
 336
 337 /* Mnemonic string for each format of end-of-line.  */
 338 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 339 /* Mnemonic string to indicate format of end-of-line is not yet
 340    decided.  */
 341 Lisp_Object eol_mnemonic_undecided;
 342
 343 /* Format of end-of-line decided by system.  This is Qunix on
 344    Unix and Mac, Qdos on DOS/Windows.
 345    This has an effect only for external encoding (i.e. for output to
 346    file and process), not for in-buffer or Lisp string encoding.  */
 347 static Lisp_Object system_eol_type;
 348
 349 #ifdef emacs
 350
 351 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 352
 353 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 354
 355 /* Coding system emacs-mule and raw-text are for converting only
 356    end-of-line format.  */
 357 Lisp_Object Qemacs_mule, Qraw_text;
 358 Lisp_Object Qutf_8_emacs;
 359
 360 /* Coding-systems are handed between Emacs Lisp programs and C internal
 361    routines by the following three variables.  */
 362 /* Coding-system for reading files and receiving data from process.  */
 363 Lisp_Object Vcoding_system_for_read;
 364 /* Coding-system for writing files and sending data to process.  */
 365 Lisp_Object Vcoding_system_for_write;
 366 /* Coding-system actually used in the latest I/O.  */
 367 Lisp_Object Vlast_coding_system_used;
 368 /* Set to non-nil when an error is detected while code conversion.  */
 369 Lisp_Object Vlast_code_conversion_error;
 370 /* A vector of length 256 which contains information about special
 371    Latin codes (especially for dealing with Microsoft codes).  */
 372 Lisp_Object Vlatin_extra_code_table;
 373
 374 /* Flag to inhibit code conversion of end-of-line format.  */
 375 int inhibit_eol_conversion;
 376
 377 /* Flag to inhibit ISO2022 escape sequence detection.  */
 378 int inhibit_iso_escape_detection;
 379
 380 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 381 int inherit_process_coding_system;
 382
 383 /* Coding system to be used to encode text for terminal display.  */
 384 struct coding_system terminal_coding;
 385
 386 /* Coding system to be used to encode text for terminal display when
 387    terminal coding system is nil.  */
 388 struct coding_system safe_terminal_coding;
 389
 390 /* Coding system of what is sent from terminal keyboard.  */
 391 struct coding_system keyboard_coding;
 392
 393 Lisp_Object Vfile_coding_system_alist;
 394 Lisp_Object Vprocess_coding_system_alist;
 395 Lisp_Object Vnetwork_coding_system_alist;
 396
 397 Lisp_Object Vlocale_coding_system;
 398
 399 #endif /* emacs */
 400
 401 /* Flag to tell if we look up translation table on character code
 402    conversion.  */
 403 Lisp_Object Venable_character_translation;
 404 /* Standard translation table to look up on decoding (reading).  */
 405 Lisp_Object Vstandard_translation_table_for_decode;
 406 /* Standard translation table to look up on encoding (writing).  */
 407 Lisp_Object Vstandard_translation_table_for_encode;
 408
 409 Lisp_Object Qtranslation_table;
 410 Lisp_Object Qtranslation_table_id;
 411 Lisp_Object Qtranslation_table_for_decode;
 412 Lisp_Object Qtranslation_table_for_encode;
 413
 414 /* Alist of charsets vs revision number.  */
 415 static Lisp_Object Vcharset_revision_table;
 416
 417 /* Default coding systems used for process I/O.  */
 418 Lisp_Object Vdefault_process_coding_system;
 419
 420 /* Char table for translating Quail and self-inserting input.  */
 421 Lisp_Object Vtranslation_table_for_input;
 422
 423 /* Two special coding systems.  */
 424 Lisp_Object Vsjis_coding_system;
 425 Lisp_Object Vbig5_coding_system;
 426
 427 /* ISO2022 section */
 428
 429 #define CODING_ISO_INITIAL(coding, reg)                 \
 430   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 431                      coding_attr_iso_initial),          \
 432                reg)))
 433
 434
 435 #define CODING_ISO_REQUEST(coding, charset_id)  \
 436   ((charset_id <= (coding)->max_charset_id      \
 437     ? (coding)->safe_charsets[charset_id]       \
 438     : -1))
 439
 440
 441 #define CODING_ISO_FLAGS(coding)        \
 442   ((coding)->spec.iso_2022.flags)
 443 #define CODING_ISO_DESIGNATION(coding, reg)     \
 444   ((coding)->spec.iso_2022.current_designation[reg])
 445 #define CODING_ISO_INVOCATION(coding, plane)    \
 446   ((coding)->spec.iso_2022.current_invocation[plane])
 447 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 448   ((coding)->spec.iso_2022.single_shifting)
 449 #define CODING_ISO_BOL(coding)  \
 450   ((coding)->spec.iso_2022.bol)
 451 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 452   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 453
 454 /* Control characters of ISO2022.  */
 455                         /* code */      /* function */
 456 #define ISO_CODE_LF     0x0A            /* line-feed */
 457 #define ISO_CODE_CR     0x0D            /* carriage-return */
 458 #define ISO_CODE_SO     0x0E            /* shift-out */
 459 #define ISO_CODE_SI     0x0F            /* shift-in */
 460 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 461 #define ISO_CODE_ESC    0x1B            /* escape */
 462 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 463 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 464 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 465
 466 /* All code (1-byte) of ISO2022 is classified into one of the
 467    followings.  */
 468 enum iso_code_class_type
 469   {
 470     ISO_control_0,              /* Control codes in the range
 471                                    0x00..0x1F and 0x7F, except for the
 472                                    following 5 codes.  */
 473     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 474     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 475     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 476     ISO_escape,                 /* ISO_CODE_SO (0x1B) */
 477     ISO_control_1,              /* Control codes in the range
 478                                    0x80..0x9F, except for the
 479                                    following 3 codes.  */
 480     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 481     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 482     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 483     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 484     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 485     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 486     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 487   };
 488
 489 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 490     `iso-flags' attribute of an iso2022 coding system.  */
 491
 492 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 493    instead of the correct short-form sequence (e.g. ESC $ A).  */
 494 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 495
 496 /* If set, reset graphic planes and registers at end-of-line to the
 497    initial state.  */
 498 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 499
 500 /* If set, reset graphic planes and registers before any control
 501    characters to the initial state.  */
 502 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 503
 504 /* If set, encode by 7-bit environment.  */
 505 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 506
 507 /* If set, use locking-shift function.  */
 508 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 509
 510 /* If set, use single-shift function.  Overwrite
 511    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 512 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 513
 514 /* If set, use designation escape sequence.  */
 515 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 516
 517 /* If set, produce revision number sequence.  */
 518 #define CODING_ISO_FLAG_REVISION        0x0080
 519
 520 /* If set, produce ISO6429's direction specifying sequence.  */
 521 #define CODING_ISO_FLAG_DIRECTION       0x0100
 522
 523 /* If set, assume designation states are reset at beginning of line on
 524    output.  */
 525 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 526
 527 /* If set, designation sequence should be placed at beginning of line
 528    on output.  */
 529 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 530
 531 /* If set, do not encode unsafe charactes on output.  */
 532 #define CODING_ISO_FLAG_SAFE            0x0800
 533
 534 /* If set, extra latin codes (128..159) are accepted as a valid code
 535    on input.  */
 536 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 537
 538 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 539
 540 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 541
 542 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 543
 544 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 545
 546 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 547
 548 /* A character to be produced on output if encoding of the original
 549    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 550 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 551
 552
 553 /* UTF-16 section */
 554 #define CODING_UTF_16_BOM(coding)       \
 555   ((coding)->spec.utf_16.bom)
 556
 557 #define CODING_UTF_16_ENDIAN(coding)    \
 558   ((coding)->spec.utf_16.endian)
 559
 560 #define CODING_UTF_16_SURROGATE(coding) \
 561   ((coding)->spec.utf_16.surrogate)
 562
 563
 564 /* CCL section */
 565 #define CODING_CCL_DECODER(coding)      \
 566   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 567 #define CODING_CCL_ENCODER(coding)      \
 568   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 569 #define CODING_CCL_VALIDS(coding)                                          \
 570   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 571
 572 /* Index for each coding category in `coding_categories' */
 573
 574 enum coding_category
 575   {
 576     coding_category_iso_7,
 577     coding_category_iso_7_tight,
 578     coding_category_iso_8_1,
 579     coding_category_iso_8_2,
 580     coding_category_iso_7_else,
 581     coding_category_iso_8_else,
 582     coding_category_utf_8,
 583     coding_category_utf_16_auto,
 584     coding_category_utf_16_be,
 585     coding_category_utf_16_le,
 586     coding_category_utf_16_be_nosig,
 587     coding_category_utf_16_le_nosig,
 588     coding_category_charset,
 589     coding_category_sjis,
 590     coding_category_big5,
 591     coding_category_ccl,
 592     coding_category_emacs_mule,
 593     /* All above are targets of code detection.  */
 594     coding_category_raw_text,
 595     coding_category_undecided,
 596     coding_category_max
 597   };
 598
 599 /* Definitions of flag bits used in detect_coding_XXXX.  */
 600 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 601 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 602 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 603 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 604 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 605 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 606 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 607 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 608 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 609 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 610 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 611 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 612 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 613 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 614 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 615 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 616 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 617 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 618
 619 /* This value is returned if detect_coding_mask () find nothing other
 620    than ASCII characters.  */
 621 #define CATEGORY_MASK_ANY               \
 622   (CATEGORY_MASK_ISO_7                  \
 623    | CATEGORY_MASK_ISO_7_TIGHT          \
 624    | CATEGORY_MASK_ISO_8_1              \
 625    | CATEGORY_MASK_ISO_8_2              \
 626    | CATEGORY_MASK_ISO_7_ELSE           \
 627    | CATEGORY_MASK_ISO_8_ELSE           \
 628    | CATEGORY_MASK_UTF_8                \
 629    | CATEGORY_MASK_UTF_16_BE            \
 630    | CATEGORY_MASK_UTF_16_LE            \
 631    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 632    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 633    | CATEGORY_MASK_CHARSET              \
 634    | CATEGORY_MASK_SJIS                 \
 635    | CATEGORY_MASK_BIG5                 \
 636    | CATEGORY_MASK_CCL                  \
 637    | CATEGORY_MASK_EMACS_MULE)
 638
 639
 640 #define CATEGORY_MASK_ISO_7BIT \
 641   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 642
 643 #define CATEGORY_MASK_ISO_8BIT \
 644   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 645
 646 #define CATEGORY_MASK_ISO_ELSE \
 647   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 648
 649 #define CATEGORY_MASK_ISO_ESCAPE        \
 650   (CATEGORY_MASK_ISO_7                  \
 651    | CATEGORY_MASK_ISO_7_TIGHT          \
 652    | CATEGORY_MASK_ISO_7_ELSE           \
 653    | CATEGORY_MASK_ISO_8_ELSE)
 654
 655 #define CATEGORY_MASK_ISO       \
 656   (  CATEGORY_MASK_ISO_7BIT     \
 657      | CATEGORY_MASK_ISO_8BIT   \
 658      | CATEGORY_MASK_ISO_ELSE)
 659
 660 #define CATEGORY_MASK_UTF_16            \
 661   (CATEGORY_MASK_UTF_16_BE              \
 662    | CATEGORY_MASK_UTF_16_LE            \
 663    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 664    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 665
 666
 667 /* List of symbols `coding-category-xxx' ordered by priority.  This
 668    variable is exposed to Emacs Lisp.  */
 669 static Lisp_Object Vcoding_category_list;
 670
 671 /* Table of coding categories (Lisp symbols).  This variable is for
 672    internal use oly.  */
 673 static Lisp_Object Vcoding_category_table;
 674
 675 /* Table of coding-categories ordered by priority.  */
 676 static enum coding_category coding_priorities[coding_category_max];
 677
 678 /* Nth element is a coding context for the coding system bound to the
 679    Nth coding category.  */
 680 static struct coding_system coding_categories[coding_category_max];
 681
 682 /*** Commonly used macros and functions ***/
 683
 684 #ifndef min
 685 #define min(a, b) ((a) < (b) ? (a) : (b))
 686 #endif
 687 #ifndef max
 688 #define max(a, b) ((a) > (b) ? (a) : (b))
 689 #endif
 690
 691 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 692   do {                                                  \
 693     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 694     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 695   } while (0)
 696
 697
 698 /* Safely get one byte from the source text pointed by SRC which ends
 699    at SRC_END, and set C to that byte.  If there are not enough bytes
 700    in the source, it jumps to `no_more_source'.  If multibytep is
 701    nonzero, and a multibyte character is found at SRC, set C to the
 702    negative value of the character code.  The caller should declare
 703    and set these variables appropriately in advance:
 704         src, src_end, multibytep */
 705
 706 #define ONE_MORE_BYTE(c)                                \
 707   do {                                                  \
 708     if (src == src_end)                                 \
 709       {                                                 \
 710         if (src_base < src)                             \
 711           record_conversion_result                      \
 712             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 713         goto no_more_source;                            \
 714       }                                                 \
 715     c = *src++;                                         \
 716     if (multibytep && (c & 0x80))                       \
 717       {                                                 \
 718         if ((c & 0xFE) == 0xC0)                         \
 719           c = ((c & 1) << 6) | *src++;                  \
 720         else                                            \
 721           {                                             \
 722             src--;                                      \
 723             c = - string_char (src, &src, NULL);        \
 724             record_conversion_result                    \
 725               (coding, CODING_RESULT_INVALID_SRC);      \
 726           }                                             \
 727       }                                                 \
 728     consumed_chars++;                                   \
 729   } while (0)
 730
 731
 732 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 733   do {                                                  \
 734     c = *src++;                                         \
 735     if (multibytep && (c & 0x80))                       \
 736       {                                                 \
 737         if ((c & 0xFE) == 0xC0)                         \
 738           c = ((c & 1) << 6) | *src++;                  \
 739         else                                            \
 740           {                                             \
 741             src--;                                      \
 742             c = - string_char (src, &src, NULL);        \
 743             record_conversion_result                    \
 744               (coding, CODING_RESULT_INVALID_SRC);      \
 745           }                                             \
 746       }                                                 \
 747     consumed_chars++;                                   \
 748   } while (0)
 749
 750
 751 /* Store a byte C in the place pointed by DST and increment DST to the
 752    next free point, and increment PRODUCED_CHARS.  The caller should
 753    assure that C is 0..127, and declare and set the variable `dst'
 754    appropriately in advance.
 755 */
 756
 757
 758 #define EMIT_ONE_ASCII_BYTE(c)  \
 759   do {                          \
 760     produced_chars++;           \
 761     *dst++ = (c);               \
 762   } while (0)
 763
 764
 765 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 766
 767 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 768   do {                                  \
 769     produced_chars += 2;                \
 770     *dst++ = (c1), *dst++ = (c2);       \
 771   } while (0)
 772
 773
 774 /* Store a byte C in the place pointed by DST and increment DST to the
 775    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 776    nonzero, store in an appropriate multibyte from.  The caller should
 777    declare and set the variables `dst' and `multibytep' appropriately
 778    in advance.  */
 779
 780 #define EMIT_ONE_BYTE(c)                \
 781   do {                                  \
 782     produced_chars++;                   \
 783     if (multibytep)                     \
 784       {                                 \
 785         int ch = (c);                   \
 786         if (ch >= 0x80)                 \
 787           ch = BYTE8_TO_CHAR (ch);      \
 788         CHAR_STRING_ADVANCE (ch, dst);  \
 789       }                                 \
 790     else                                \
 791       *dst++ = (c);                     \
 792   } while (0)
 793
 794
 795 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 796
 797 #define EMIT_TWO_BYTES(c1, c2)          \
 798   do {                                  \
 799     produced_chars += 2;                \
 800     if (multibytep)                     \
 801       {                                 \
 802         int ch;                         \
 803                                         \
 804         ch = (c1);                      \
 805         if (ch >= 0x80)                 \
 806           ch = BYTE8_TO_CHAR (ch);      \
 807         CHAR_STRING_ADVANCE (ch, dst);  \
 808         ch = (c2);                      \
 809         if (ch >= 0x80)                 \
 810           ch = BYTE8_TO_CHAR (ch);      \
 811         CHAR_STRING_ADVANCE (ch, dst);  \
 812       }                                 \
 813     else                                \
 814       {                                 \
 815         *dst++ = (c1);                  \
 816         *dst++ = (c2);                  \
 817       }                                 \
 818   } while (0)
 819
 820
 821 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 822   do {                                  \
 823     EMIT_ONE_BYTE (c1);                 \
 824     EMIT_TWO_BYTES (c2, c3);            \
 825   } while (0)
 826
 827
 828 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 829   do {                                          \
 830     EMIT_TWO_BYTES (c1, c2);                    \
 831     EMIT_TWO_BYTES (c3, c4);                    \
 832   } while (0)
 833
 834
 835 /* Prototypes for static functions.  */
 836 static void record_conversion_result P_ ((struct coding_system *coding,
 837                                           enum coding_result_code result));
 838 static int detect_coding_utf_8 P_ ((struct coding_system *,
 839                                     struct coding_detection_info *info));
 840 static void decode_coding_utf_8 P_ ((struct coding_system *));
 841 static int encode_coding_utf_8 P_ ((struct coding_system *));
 842
 843 static int detect_coding_utf_16 P_ ((struct coding_system *,
 844                                      struct coding_detection_info *info));
 845 static void decode_coding_utf_16 P_ ((struct coding_system *));
 846 static int encode_coding_utf_16 P_ ((struct coding_system *));
 847
 848 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 849                                        struct coding_detection_info *info));
 850 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 851 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 852
 853 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 854                                          struct coding_detection_info *info));
 855 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 856 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 857
 858 static int detect_coding_sjis P_ ((struct coding_system *,
 859                                    struct coding_detection_info *info));
 860 static void decode_coding_sjis P_ ((struct coding_system *));
 861 static int encode_coding_sjis P_ ((struct coding_system *));
 862
 863 static int detect_coding_big5 P_ ((struct coding_system *,
 864                                    struct coding_detection_info *info));
 865 static void decode_coding_big5 P_ ((struct coding_system *));
 866 static int encode_coding_big5 P_ ((struct coding_system *));
 867
 868 static int detect_coding_ccl P_ ((struct coding_system *,
 869                                   struct coding_detection_info *info));
 870 static void decode_coding_ccl P_ ((struct coding_system *));
 871 static int encode_coding_ccl P_ ((struct coding_system *));
 872
 873 static void decode_coding_raw_text P_ ((struct coding_system *));
 874 static int encode_coding_raw_text P_ ((struct coding_system *));
 875
 876 static void coding_set_source P_ ((struct coding_system *));
 877 static void coding_set_destination P_ ((struct coding_system *));
 878 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
 879 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
 880                                             EMACS_INT));
 881 static unsigned char *alloc_destination P_ ((struct coding_system *,
 882                                              EMACS_INT, unsigned char *));
 883 static void setup_iso_safe_charsets P_ ((Lisp_Object));
 884 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
 885                                                      int *, int *,
 886                                                      unsigned char *));
 887 static int detect_eol P_ ((const unsigned char *,
 888                            EMACS_INT, enum coding_category));
 889 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
 890 static void decode_eol P_ ((struct coding_system *));
 891 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
 892 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
 893                                         int, int *, int *));
 894 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
 895 static INLINE void produce_composition P_ ((struct coding_system *, int *,
 896                                             EMACS_INT));
 897 static INLINE void produce_charset P_ ((struct coding_system *, int *,
 898                                         EMACS_INT));
 899 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
 900 static int decode_coding P_ ((struct coding_system *));
 901 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
 902                                                       struct coding_system *,
 903                                                       int *, EMACS_INT *));
 904 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
 905                                                   struct coding_system *,
 906                                                   int *, EMACS_INT *));
 907 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
 908 static int encode_coding P_ ((struct coding_system *));
 909 static Lisp_Object make_conversion_work_buffer P_ ((int));
 910 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
 911 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
 912 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 913
 914 static void
 915 record_conversion_result (struct coding_system *coding,
 916                           enum coding_result_code result)
 917 {
 918   coding->result = result;
 919   switch (result)
 920     {
 921     case CODING_RESULT_INSUFFICIENT_SRC:
 922       Vlast_code_conversion_error = Qinsufficient_source;
 923       break;
 924     case CODING_RESULT_INCONSISTENT_EOL:
 925       Vlast_code_conversion_error = Qinconsistent_eol;
 926       break;
 927     case CODING_RESULT_INVALID_SRC:
 928       Vlast_code_conversion_error = Qinvalid_source;
 929       break;
 930     case CODING_RESULT_INTERRUPT:
 931       Vlast_code_conversion_error = Qinterrupted;
 932       break;
 933     case CODING_RESULT_INSUFFICIENT_MEM:
 934       Vlast_code_conversion_error = Qinsufficient_memory;
 935       break;
 936     default:
 937       Vlast_code_conversion_error = intern ("Unknown error");
 938     }
 939 }
 940
 941 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 942   do {                                                                       \
 943     charset_map_loaded = 0;                                                  \
 944     c = DECODE_CHAR (charset, code);                                         \
 945     if (charset_map_loaded)                                                  \
 946       {                                                                      \
 947         const unsigned char *orig = coding->source;                          \
 948         EMACS_INT offset;                                                    \
 949                                                                              \
 950         coding_set_source (coding);                                          \
 951         offset = coding->source - orig;                                      \
 952         src += offset;                                                       \
 953         src_base += offset;                                                  \
 954         src_end += offset;                                                   \
 955       }                                                                      \
 956   } while (0)
 957
 958
 959 #define ASSURE_DESTINATION(bytes)                               \
 960   do {                                                          \
 961     if (dst + (bytes) >= dst_end)                               \
 962       {                                                         \
 963         int more_bytes = charbuf_end - charbuf + (bytes);       \
 964                                                                 \
 965         dst = alloc_destination (coding, more_bytes, dst);      \
 966         dst_end = coding->destination + coding->dst_bytes;      \
 967       }                                                         \
 968   } while (0)
 969
 970
 971
 972 static void
 973 coding_set_source (coding)
 974      struct coding_system *coding;
 975 {
 976   if (BUFFERP (coding->src_object))
 977     {
 978       struct buffer *buf = XBUFFER (coding->src_object);
 979
 980       if (coding->src_pos < 0)
 981         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 982       else
 983         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 984     }
 985   else if (STRINGP (coding->src_object))
 986     {
 987       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 988     }
 989   else
 990     /* Otherwise, the source is C string and is never relocated
 991        automatically.  Thus we don't have to update anything.  */
 992     ;
 993 }
 994
 995 static void
 996 coding_set_destination (coding)
 997      struct coding_system *coding;
 998 {
 999   if (BUFFERP (coding->dst_object))
1000     {
1001       if (coding->src_pos < 0)
1002         {
1003           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
1004           coding->dst_bytes = (GAP_END_ADDR
1005                                - (coding->src_bytes - coding->consumed)
1006                                - coding->destination);
1007         }
1008       else
1009         {
1010           /* We are sure that coding->dst_pos_byte is before the gap
1011              of the buffer. */
1012           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1013                                  + coding->dst_pos_byte - 1);
1014           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1015                                - coding->destination);
1016         }
1017     }
1018   else
1019     /* Otherwise, the destination is C string and is never relocated
1020        automatically.  Thus we don't have to update anything.  */
1021     ;
1022 }
1023
1024
1025 static void
1026 coding_alloc_by_realloc (coding, bytes)
1027      struct coding_system *coding;
1028      EMACS_INT bytes;
1029 {
1030   coding->destination = (unsigned char *) xrealloc (coding->destination,
1031                                                     coding->dst_bytes + bytes);
1032   coding->dst_bytes += bytes;
1033 }
1034
1035 static void
1036 coding_alloc_by_making_gap (coding, bytes)
1037      struct coding_system *coding;
1038      EMACS_INT bytes;
1039 {
1040   if (BUFFERP (coding->dst_object)
1041       && EQ (coding->src_object, coding->dst_object))
1042     {
1043       EMACS_INT add = coding->src_bytes - coding->consumed;
1044
1045       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1046       make_gap (bytes);
1047       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1048     }
1049   else
1050     {
1051       Lisp_Object this_buffer;
1052
1053       this_buffer = Fcurrent_buffer ();
1054       set_buffer_internal (XBUFFER (coding->dst_object));
1055       make_gap (bytes);
1056       set_buffer_internal (XBUFFER (this_buffer));
1057     }
1058 }
1059
1060
1061 static unsigned char *
1062 alloc_destination (coding, nbytes, dst)
1063      struct coding_system *coding;
1064      EMACS_INT nbytes;
1065      unsigned char *dst;
1066 {
1067   EMACS_INT offset = dst - coding->destination;
1068
1069   if (BUFFERP (coding->dst_object))
1070     coding_alloc_by_making_gap (coding, nbytes);
1071   else
1072     coding_alloc_by_realloc (coding, nbytes);
1073   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1074   coding_set_destination (coding);
1075   dst = coding->destination + offset;
1076   return dst;
1077 }
1078
1079 /** Macros for annotations.  */
1080
1081 /* Maximum length of annotation data (sum of annotations for
1082    composition and charset).  */
1083 #define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1084
1085 /* An annotation data is stored in the array coding->charbuf in this
1086    format:
1087      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1088    LENGTH is the number of elements in the annotation.
1089    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1090    NCHARS is the number of characters in the text annotated.
1091
1092    The format of the following elements depend on ANNOTATION_MASK.
1093
1094    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1095    follows:
1096      ... METHOD [ COMPOSITION-COMPONENTS ... ]
1097    METHOD is one of enum composition_method.
1098    Optionnal COMPOSITION-COMPONENTS are characters and composition
1099    rules.
1100
1101    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1102    follows.  */
1103
1104 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1105   do {                                                  \
1106     *(buf)++ = -(len);                                  \
1107     *(buf)++ = (mask);                                  \
1108     *(buf)++ = (nchars);                                \
1109     coding->annotated = 1;                              \
1110   } while (0);
1111
1112 #define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
1113   do {                                                                      \
1114     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1115     *buf++ = method;                                                        \
1116   } while (0)
1117
1118
1119 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1120   do {                                                                  \
1121     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1122     *buf++ = id;                                                        \
1123   } while (0)
1124
1125 \f
1126 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1127
1128
1129
1130 \f
1131 /*** 3. UTF-8 ***/
1132
1133 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1134    Check if a text is encoded in UTF-8.  If it is, return 1, else
1135    return 0.  */
1136
1137 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1138 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1139 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1140 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1141 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1142 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1143
1144 static int
1145 detect_coding_utf_8 (coding, detect_info)
1146      struct coding_system *coding;
1147      struct coding_detection_info *detect_info;
1148 {
1149   const unsigned char *src = coding->source, *src_base;
1150   const unsigned char *src_end = coding->source + coding->src_bytes;
1151   int multibytep = coding->src_multibyte;
1152   int consumed_chars = 0;
1153   int found = 0;
1154
1155   detect_info->checked |= CATEGORY_MASK_UTF_8;
1156   /* A coding system of this category is always ASCII compatible.  */
1157   src += coding->head_ascii;
1158
1159   while (1)
1160     {
1161       int c, c1, c2, c3, c4;
1162
1163       src_base = src;
1164       ONE_MORE_BYTE (c);
1165       if (c < 0 || UTF_8_1_OCTET_P (c))
1166         continue;
1167       ONE_MORE_BYTE (c1);
1168       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1169         break;
1170       if (UTF_8_2_OCTET_LEADING_P (c))
1171         {
1172           found = CATEGORY_MASK_UTF_8;
1173           continue;
1174         }
1175       ONE_MORE_BYTE (c2);
1176       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1177         break;
1178       if (UTF_8_3_OCTET_LEADING_P (c))
1179         {
1180           found = CATEGORY_MASK_UTF_8;
1181           continue;
1182         }
1183       ONE_MORE_BYTE (c3);
1184       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1185         break;
1186       if (UTF_8_4_OCTET_LEADING_P (c))
1187         {
1188           found = CATEGORY_MASK_UTF_8;
1189           continue;
1190         }
1191       ONE_MORE_BYTE (c4);
1192       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1193         break;
1194       if (UTF_8_5_OCTET_LEADING_P (c))
1195         {
1196           found = CATEGORY_MASK_UTF_8;
1197           continue;
1198         }
1199       break;
1200     }
1201   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1202   return 0;
1203
1204  no_more_source:
1205   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1206     {
1207       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1208       return 0;
1209     }
1210   detect_info->found |= found;
1211   return 1;
1212 }
1213
1214
1215 static void
1216 decode_coding_utf_8 (coding)
1217      struct coding_system *coding;
1218 {
1219   const unsigned char *src = coding->source + coding->consumed;
1220   const unsigned char *src_end = coding->source + coding->src_bytes;
1221   const unsigned char *src_base;
1222   int *charbuf = coding->charbuf + coding->charbuf_used;
1223   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1224   int consumed_chars = 0, consumed_chars_base;
1225   int multibytep = coding->src_multibyte;
1226   Lisp_Object attr, charset_list;
1227
1228   CODING_GET_INFO (coding, attr, charset_list);
1229
1230   while (1)
1231     {
1232       int c, c1, c2, c3, c4, c5;
1233
1234       src_base = src;
1235       consumed_chars_base = consumed_chars;
1236
1237       if (charbuf >= charbuf_end)
1238         break;
1239
1240       ONE_MORE_BYTE (c1);
1241       if (c1 < 0)
1242         {
1243           c = - c1;
1244         }
1245       else if (UTF_8_1_OCTET_P(c1))
1246         {
1247           c = c1;
1248         }
1249       else
1250         {
1251           ONE_MORE_BYTE (c2);
1252           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1253             goto invalid_code;
1254           if (UTF_8_2_OCTET_LEADING_P (c1))
1255             {
1256               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1257               /* Reject overlong sequences here and below.  Encoders
1258                  producing them are incorrect, they can be misleading,
1259                  and they mess up read/write invariance.  */
1260               if (c < 128)
1261                 goto invalid_code;
1262             }
1263           else
1264             {
1265               ONE_MORE_BYTE (c3);
1266               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1267                 goto invalid_code;
1268               if (UTF_8_3_OCTET_LEADING_P (c1))
1269                 {
1270                   c = (((c1 & 0xF) << 12)
1271                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1272                   if (c < 0x800
1273                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1274                     goto invalid_code;
1275                 }
1276               else
1277                 {
1278                   ONE_MORE_BYTE (c4);
1279                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1280                     goto invalid_code;
1281                   if (UTF_8_4_OCTET_LEADING_P (c1))
1282                     {
1283                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1284                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1285                     if (c < 0x10000)
1286                       goto invalid_code;
1287                     }
1288                   else
1289                     {
1290                       ONE_MORE_BYTE (c5);
1291                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1292                         goto invalid_code;
1293                       if (UTF_8_5_OCTET_LEADING_P (c1))
1294                         {
1295                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1296                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1297                                | (c5 & 0x3F));
1298                           if ((c > MAX_CHAR) || (c < 0x200000))
1299                             goto invalid_code;
1300                         }
1301                       else
1302                         goto invalid_code;
1303                     }
1304                 }
1305             }
1306         }
1307
1308       *charbuf++ = c;
1309       continue;
1310
1311     invalid_code:
1312       src = src_base;
1313       consumed_chars = consumed_chars_base;
1314       ONE_MORE_BYTE (c);
1315       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1316       coding->errors++;
1317     }
1318
1319  no_more_source:
1320   coding->consumed_char += consumed_chars_base;
1321   coding->consumed = src_base - coding->source;
1322   coding->charbuf_used = charbuf - coding->charbuf;
1323 }
1324
1325
1326 static int
1327 encode_coding_utf_8 (coding)
1328      struct coding_system *coding;
1329 {
1330   int multibytep = coding->dst_multibyte;
1331   int *charbuf = coding->charbuf;
1332   int *charbuf_end = charbuf + coding->charbuf_used;
1333   unsigned char *dst = coding->destination + coding->produced;
1334   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1335   int produced_chars = 0;
1336   int c;
1337
1338   if (multibytep)
1339     {
1340       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1341
1342       while (charbuf < charbuf_end)
1343         {
1344           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1345
1346           ASSURE_DESTINATION (safe_room);
1347           c = *charbuf++;
1348           if (CHAR_BYTE8_P (c))
1349             {
1350               c = CHAR_TO_BYTE8 (c);
1351               EMIT_ONE_BYTE (c);
1352             }
1353           else
1354             {
1355               CHAR_STRING_ADVANCE (c, pend);
1356               for (p = str; p < pend; p++)
1357                 EMIT_ONE_BYTE (*p);
1358             }
1359         }
1360     }
1361   else
1362     {
1363       int safe_room = MAX_MULTIBYTE_LENGTH;
1364
1365       while (charbuf < charbuf_end)
1366         {
1367           ASSURE_DESTINATION (safe_room);
1368           c = *charbuf++;
1369           if (CHAR_BYTE8_P (c))
1370             *dst++ = CHAR_TO_BYTE8 (c);
1371           else
1372             dst += CHAR_STRING (c, dst);
1373           produced_chars++;
1374         }
1375     }
1376   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1377   coding->produced_char += produced_chars;
1378   coding->produced = dst - coding->destination;
1379   return 0;
1380 }
1381
1382
1383 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1384    Check if a text is encoded in one of UTF-16 based coding systems.
1385    If it is, return 1, else return 0.  */
1386
1387 #define UTF_16_HIGH_SURROGATE_P(val) \
1388   (((val) & 0xFC00) == 0xD800)
1389
1390 #define UTF_16_LOW_SURROGATE_P(val) \
1391   (((val) & 0xFC00) == 0xDC00)
1392
1393 #define UTF_16_INVALID_P(val)   \
1394   (((val) == 0xFFFE)            \
1395    || ((val) == 0xFFFF)         \
1396    || UTF_16_LOW_SURROGATE_P (val))
1397
1398
1399 static int
1400 detect_coding_utf_16 (coding, detect_info)
1401      struct coding_system *coding;
1402      struct coding_detection_info *detect_info;
1403 {
1404   const unsigned char *src = coding->source, *src_base = src;
1405   const unsigned char *src_end = coding->source + coding->src_bytes;
1406   int multibytep = coding->src_multibyte;
1407   int consumed_chars = 0;
1408   int c1, c2;
1409
1410   detect_info->checked |= CATEGORY_MASK_UTF_16;
1411   if (coding->mode & CODING_MODE_LAST_BLOCK
1412       && (coding->src_chars & 1))
1413     {
1414       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1415       return 0;
1416     }
1417
1418   ONE_MORE_BYTE (c1);
1419   ONE_MORE_BYTE (c2);
1420   if ((c1 == 0xFF) && (c2 == 0xFE))
1421     {
1422       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1423                              | CATEGORY_MASK_UTF_16_AUTO);
1424       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1425                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1426                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1427     }
1428   else if ((c1 == 0xFE) && (c2 == 0xFF))
1429     {
1430       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1431                              | CATEGORY_MASK_UTF_16_AUTO);
1432       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1433                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1434                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1435     }
1436   else if (c1 >= 0 && c2 >= 0)
1437     {
1438       detect_info->rejected
1439         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1440     }
1441  no_more_source:
1442   return 1;
1443 }
1444
1445 static void
1446 decode_coding_utf_16 (coding)
1447      struct coding_system *coding;
1448 {
1449   const unsigned char *src = coding->source + coding->consumed;
1450   const unsigned char *src_end = coding->source + coding->src_bytes;
1451   const unsigned char *src_base;
1452   int *charbuf = coding->charbuf + coding->charbuf_used;
1453   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1454   int consumed_chars = 0, consumed_chars_base;
1455   int multibytep = coding->src_multibyte;
1456   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1457   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1458   int surrogate = CODING_UTF_16_SURROGATE (coding);
1459   Lisp_Object attr, charset_list;
1460
1461   CODING_GET_INFO (coding, attr, charset_list);
1462
1463   if (bom == utf_16_with_bom)
1464     {
1465       int c, c1, c2;
1466
1467       src_base = src;
1468       ONE_MORE_BYTE (c1);
1469       ONE_MORE_BYTE (c2);
1470       c = (c1 << 8) | c2;
1471
1472       if (endian == utf_16_big_endian
1473           ? c != 0xFEFF : c != 0xFFFE)
1474         {
1475           /* The first two bytes are not BOM.  Treat them as bytes
1476              for a normal character.  */
1477           src = src_base;
1478           coding->errors++;
1479         }
1480       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1481     }
1482   else if (bom == utf_16_detect_bom)
1483     {
1484       /* We have already tried to detect BOM and failed in
1485          detect_coding.  */
1486       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1487     }
1488
1489   while (1)
1490     {
1491       int c, c1, c2;
1492
1493       src_base = src;
1494       consumed_chars_base = consumed_chars;
1495
1496       if (charbuf + 2 >= charbuf_end)
1497         break;
1498
1499       ONE_MORE_BYTE (c1);
1500       if (c1 < 0)
1501         {
1502           *charbuf++ = -c1;
1503           continue;
1504         }
1505       ONE_MORE_BYTE (c2);
1506       if (c2 < 0)
1507         {
1508           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1509           *charbuf++ = -c2;
1510           continue;
1511         }
1512       c = (endian == utf_16_big_endian
1513            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1514       if (surrogate)
1515         {
1516           if (! UTF_16_LOW_SURROGATE_P (c))
1517             {
1518               if (endian == utf_16_big_endian)
1519                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1520               else
1521                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1522               *charbuf++ = c1;
1523               *charbuf++ = c2;
1524               coding->errors++;
1525               if (UTF_16_HIGH_SURROGATE_P (c))
1526                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1527               else
1528                 *charbuf++ = c;
1529             }
1530           else
1531             {
1532               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1533               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1534               *charbuf++ = 0x10000 + c;
1535             }
1536         }
1537       else
1538         {
1539           if (UTF_16_HIGH_SURROGATE_P (c))
1540             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1541           else
1542             *charbuf++ = c;
1543         }
1544     }
1545
1546  no_more_source:
1547   coding->consumed_char += consumed_chars_base;
1548   coding->consumed = src_base - coding->source;
1549   coding->charbuf_used = charbuf - coding->charbuf;
1550 }
1551
1552 static int
1553 encode_coding_utf_16 (coding)
1554      struct coding_system *coding;
1555 {
1556   int multibytep = coding->dst_multibyte;
1557   int *charbuf = coding->charbuf;
1558   int *charbuf_end = charbuf + coding->charbuf_used;
1559   unsigned char *dst = coding->destination + coding->produced;
1560   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1561   int safe_room = 8;
1562   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1563   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1564   int produced_chars = 0;
1565   Lisp_Object attrs, charset_list;
1566   int c;
1567
1568   CODING_GET_INFO (coding, attrs, charset_list);
1569
1570   if (bom != utf_16_without_bom)
1571     {
1572       ASSURE_DESTINATION (safe_room);
1573       if (big_endian)
1574         EMIT_TWO_BYTES (0xFE, 0xFF);
1575       else
1576         EMIT_TWO_BYTES (0xFF, 0xFE);
1577       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1578     }
1579
1580   while (charbuf < charbuf_end)
1581     {
1582       ASSURE_DESTINATION (safe_room);
1583       c = *charbuf++;
1584       if (c >= MAX_UNICODE_CHAR)
1585         c = coding->default_char;
1586
1587       if (c < 0x10000)
1588         {
1589           if (big_endian)
1590             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1591           else
1592             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1593         }
1594       else
1595         {
1596           int c1, c2;
1597
1598           c -= 0x10000;
1599           c1 = (c >> 10) + 0xD800;
1600           c2 = (c & 0x3FF) + 0xDC00;
1601           if (big_endian)
1602             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1603           else
1604             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1605         }
1606     }
1607   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1608   coding->produced = dst - coding->destination;
1609   coding->produced_char += produced_chars;
1610   return 0;
1611 }
1612
1613 \f
1614 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1615
1616 /* Emacs' internal format for representation of multiple character
1617    sets is a kind of multi-byte encoding, i.e. characters are
1618    represented by variable-length sequences of one-byte codes.
1619
1620    ASCII characters and control characters (e.g. `tab', `newline') are
1621    represented by one-byte sequences which are their ASCII codes, in
1622    the range 0x00 through 0x7F.
1623
1624    8-bit characters of the range 0x80..0x9F are represented by
1625    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1626    code + 0x20).
1627
1628    8-bit characters of the range 0xA0..0xFF are represented by
1629    one-byte sequences which are their 8-bit code.
1630
1631    The other characters are represented by a sequence of `base
1632    leading-code', optional `extended leading-code', and one or two
1633    `position-code's.  The length of the sequence is determined by the
1634    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1635    whereas extended leading-code and position-code take the range 0xA0
1636    through 0xFF.  See `charset.h' for more details about leading-code
1637    and position-code.
1638
1639    --- CODE RANGE of Emacs' internal format ---
1640    character set        range
1641    -------------        -----
1642    ascii                0x00..0x7F
1643    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1644    eight-bit-graphic    0xA0..0xBF
1645    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1646    ---------------------------------------------
1647
1648    As this is the internal character representation, the format is
1649    usually not used externally (i.e. in a file or in a data sent to a
1650    process).  But, it is possible to have a text externally in this
1651    format (i.e. by encoding by the coding system `emacs-mule').
1652
1653    In that case, a sequence of one-byte codes has a slightly different
1654    form.
1655
1656    At first, all characters in eight-bit-control are represented by
1657    one-byte sequences which are their 8-bit code.
1658
1659    Next, character composition data are represented by the byte
1660    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1661    where,
1662         METHOD is 0xF0 plus one of composition method (enum
1663         composition_method),
1664
1665         BYTES is 0xA0 plus a byte length of this composition data,
1666
1667         CHARS is 0x20 plus a number of characters composed by this
1668         data,
1669
1670         COMPONENTs are characters of multibye form or composition
1671         rules encoded by two-byte of ASCII codes.
1672
1673    In addition, for backward compatibility, the following formats are
1674    also recognized as composition data on decoding.
1675
1676    0x80 MSEQ ...
1677    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1678
1679    Here,
1680         MSEQ is a multibyte form but in these special format:
1681           ASCII: 0xA0 ASCII_CODE+0x80,
1682           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1683         RULE is a one byte code of the range 0xA0..0xF0 that
1684         represents a composition rule.
1685   */
1686
1687 char emacs_mule_bytes[256];
1688
1689 int
1690 emacs_mule_char (coding, src, nbytes, nchars, id)
1691      struct coding_system *coding;
1692      const unsigned char *src;
1693      int *nbytes, *nchars, *id;
1694 {
1695   const unsigned char *src_end = coding->source + coding->src_bytes;
1696   const unsigned char *src_base = src;
1697   int multibytep = coding->src_multibyte;
1698   struct charset *charset;
1699   unsigned code;
1700   int c;
1701   int consumed_chars = 0;
1702
1703   ONE_MORE_BYTE (c);
1704   if (c < 0)
1705     {
1706       c = -c;
1707       charset = emacs_mule_charset[0];
1708     }
1709   else
1710     {
1711       if (c >= 0xA0)
1712         {
1713           /* Old style component character of a compostion.  */
1714           if (c == 0xA0)
1715             {
1716               ONE_MORE_BYTE (c);
1717               c -= 0x80;
1718             }
1719           else
1720             c -= 0x20;
1721         }
1722
1723       switch (emacs_mule_bytes[c])
1724         {
1725         case 2:
1726           if (! (charset = emacs_mule_charset[c]))
1727             goto invalid_code;
1728           ONE_MORE_BYTE (c);
1729           if (c < 0xA0)
1730             goto invalid_code;
1731           code = c & 0x7F;
1732           break;
1733
1734         case 3:
1735           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1736               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1737             {
1738               ONE_MORE_BYTE (c);
1739               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1740                 goto invalid_code;
1741               ONE_MORE_BYTE (c);
1742               if (c < 0xA0)
1743                 goto invalid_code;
1744               code = c & 0x7F;
1745             }
1746           else
1747             {
1748               if (! (charset = emacs_mule_charset[c]))
1749                 goto invalid_code;
1750               ONE_MORE_BYTE (c);
1751               if (c < 0xA0)
1752                 goto invalid_code;
1753               code = (c & 0x7F) << 8;
1754               ONE_MORE_BYTE (c);
1755               if (c < 0xA0)
1756                 goto invalid_code;
1757               code |= c & 0x7F;
1758             }
1759           break;
1760
1761         case 4:
1762           ONE_MORE_BYTE (c);
1763           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1764             goto invalid_code;
1765           ONE_MORE_BYTE (c);
1766           if (c < 0xA0)
1767             goto invalid_code;
1768           code = (c & 0x7F) << 8;
1769           ONE_MORE_BYTE (c);
1770           if (c < 0xA0)
1771             goto invalid_code;
1772           code |= c & 0x7F;
1773           break;
1774
1775         case 1:
1776           code = c;
1777           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1778                                      ? charset_ascii : charset_eight_bit);
1779           break;
1780
1781         default:
1782           abort ();
1783         }
1784       c = DECODE_CHAR (charset, code);
1785       if (c < 0)
1786         goto invalid_code;
1787     }
1788   *nbytes = src - src_base;
1789   *nchars = consumed_chars;
1790   if (id)
1791     *id = charset->id;
1792   return c;
1793
1794  no_more_source:
1795   return -2;
1796
1797  invalid_code:
1798   return -1;
1799 }
1800
1801
1802 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1803    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1804    else return 0.  */
1805
1806 static int
1807 detect_coding_emacs_mule (coding, detect_info)
1808      struct coding_system *coding;
1809      struct coding_detection_info *detect_info;
1810 {
1811   const unsigned char *src = coding->source, *src_base;
1812   const unsigned char *src_end = coding->source + coding->src_bytes;
1813   int multibytep = coding->src_multibyte;
1814   int consumed_chars = 0;
1815   int c;
1816   int found = 0;
1817
1818   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1819   /* A coding system of this category is always ASCII compatible.  */
1820   src += coding->head_ascii;
1821
1822   while (1)
1823     {
1824       src_base = src;
1825       ONE_MORE_BYTE (c);
1826       if (c < 0)
1827         continue;
1828       if (c == 0x80)
1829         {
1830           /* Perhaps the start of composite character.  We simple skip
1831              it because analyzing it is too heavy for detecting.  But,
1832              at least, we check that the composite character
1833              constitues of more than 4 bytes.  */
1834           const unsigned char *src_base;
1835
1836         repeat:
1837           src_base = src;
1838           do
1839             {
1840               ONE_MORE_BYTE (c);
1841             }
1842           while (c >= 0xA0);
1843
1844           if (src - src_base <= 4)
1845             break;
1846           found = CATEGORY_MASK_EMACS_MULE;
1847           if (c == 0x80)
1848             goto repeat;
1849         }
1850
1851       if (c < 0x80)
1852         {
1853           if (c < 0x20
1854               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1855             break;
1856         }
1857       else
1858         {
1859           int more_bytes = emacs_mule_bytes[*src_base] - 1;
1860
1861           while (more_bytes > 0)
1862             {
1863               ONE_MORE_BYTE (c);
1864               if (c < 0xA0)
1865                 {
1866                   src--;        /* Unread the last byte.  */
1867                   break;
1868                 }
1869               more_bytes--;
1870             }
1871           if (more_bytes != 0)
1872             break;
1873           found = CATEGORY_MASK_EMACS_MULE;
1874         }
1875     }
1876   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1877   return 0;
1878
1879  no_more_source:
1880   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1881     {
1882       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1883       return 0;
1884     }
1885   detect_info->found |= found;
1886   return 1;
1887 }
1888
1889
1890 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1891
1892 /* Decode a character represented as a component of composition
1893    sequence of Emacs 20/21 style at SRC.  Set C to that character and
1894    update SRC to the head of next character (or an encoded composition
1895    rule).  If SRC doesn't points a composition component, set C to -1.
1896    If SRC points an invalid byte sequence, global exit by a return
1897    value 0.  */
1898
1899 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1900   if (1)                                                        \
1901     {                                                           \
1902       int c;                                                    \
1903       int nbytes, nchars;                                       \
1904                                                                 \
1905       if (src == src_end)                                       \
1906         break;                                                  \
1907       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1908       if (c < 0)                                                \
1909         {                                                       \
1910           if (c == -2)                                          \
1911             break;                                              \
1912           goto invalid_code;                                    \
1913         }                                                       \
1914       *buf++ = c;                                               \
1915       src += nbytes;                                            \
1916       consumed_chars += nchars;                                 \
1917     }                                                           \
1918   else
1919
1920
1921 /* Decode a composition rule represented as a component of composition
1922    sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
1923    and increment BUF.  If SRC points an invalid byte sequence, set C
1924    to -1.  */
1925
1926 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1927   do {                                                  \
1928     int c, gref, nref;                                  \
1929                                                         \
1930     if (src >= src_end)                                 \
1931       goto invalid_code;                                \
1932     ONE_MORE_BYTE_NO_CHECK (c);                         \
1933     c -= 0xA0;                                          \
1934     if (c < 0 || c >= 81)                               \
1935       goto invalid_code;                                \
1936                                                         \
1937     gref = c / 9, nref = c % 9;                         \
1938     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1939   } while (0)
1940
1941
1942 /* Decode a composition rule represented as a component of composition
1943    sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
1944    and increment BUF.  If SRC points an invalid byte sequence, set C
1945    to -1.  */
1946
1947 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1948   do {                                                  \
1949     int gref, nref;                                     \
1950                                                         \
1951     if (src + 1>= src_end)                              \
1952       goto invalid_code;                                \
1953     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1954     gref -= 0x20;                                       \
1955     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1956     nref -= 0x20;                                       \
1957     if (gref < 0 || gref >= 81                          \
1958         || nref < 0 || nref >= 81)                      \
1959       goto invalid_code;                                \
1960     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1961   } while (0)
1962
1963
1964 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1965   do {                                                                  \
1966     /* Emacs 21 style format.  The first three bytes at SRC are         \
1967        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1968        the byte length of this composition information, CHARS is the    \
1969        number of characters composed by this composition.  */           \
1970     enum composition_method method = c - 0xF2;                          \
1971     int *charbuf_base = charbuf;                                        \
1972     int consumed_chars_limit;                                           \
1973     int nbytes, nchars;                                                 \
1974                                                                         \
1975     ONE_MORE_BYTE (c);                                                  \
1976     if (c < 0)                                                          \
1977       goto invalid_code;                                                \
1978     nbytes = c - 0xA0;                                                  \
1979     if (nbytes < 3)                                                     \
1980       goto invalid_code;                                                \
1981     ONE_MORE_BYTE (c);                                                  \
1982     if (c < 0)                                                          \
1983       goto invalid_code;                                                \
1984     nchars = c - 0xA0;                                                  \
1985     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
1986     consumed_chars_limit = consumed_chars_base + nbytes;                \
1987     if (method != COMPOSITION_RELATIVE)                                 \
1988       {                                                                 \
1989         int i = 0;                                                      \
1990         while (consumed_chars < consumed_chars_limit)                   \
1991           {                                                             \
1992             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1993               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1994             else                                                        \
1995               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1996             i++;                                                        \
1997           }                                                             \
1998         if (consumed_chars < consumed_chars_limit)                      \
1999           goto invalid_code;                                            \
2000         charbuf_base[0] -= i;                                           \
2001       }                                                                 \
2002   } while (0)
2003
2004
2005 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)                    \
2006   do {                                                                  \
2007     /* Emacs 20 style format for relative composition.  */              \
2008     /* Store multibyte form of characters to be composed.  */           \
2009     enum composition_method method = COMPOSITION_RELATIVE;              \
2010     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];                 \
2011     int *buf = components;                                              \
2012     int i, j;                                                           \
2013                                                                         \
2014     src = src_base;                                                     \
2015     ONE_MORE_BYTE (c);          /* skip 0x80 */                         \
2016     for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++)    \
2017       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                         \
2018     if (i < 2)                                                          \
2019       goto invalid_code;                                                \
2020     ADD_COMPOSITION_DATA (charbuf, i, method);                          \
2021     for (j = 0; j < i; j++)                                             \
2022       *charbuf++ = components[j];                                       \
2023   } while (0)
2024
2025
2026 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
2027   do {                                                          \
2028     /* Emacs 20 style format for rule-base composition.  */     \
2029     /* Store multibyte form of characters to be composed.  */   \
2030     enum composition_method method = COMPOSITION_WITH_RULE;     \
2031     int *charbuf_base = charbuf;                                \
2032     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
2033     int *buf = components;                                      \
2034     int i, j;                                                   \
2035                                                                 \
2036     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
2037     for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++)            \
2038       {                                                         \
2039         if (*src < 0xA0)                                        \
2040           break;                                                \
2041         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
2042         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
2043       }                                                         \
2044     if (i <= 1 || (buf - components) % 2 == 0)                  \
2045       goto invalid_code;                                        \
2046     if (charbuf + i + (i / 2) + 1 >= charbuf_end)               \
2047       goto no_more_source;                                      \
2048     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
2049     i = i * 2 - 1;                                              \
2050     for (j = 0; j < i; j++)                                     \
2051       *charbuf++ = components[j];                               \
2052     charbuf_base[0] -= i;                                       \
2053     for (j = 0; j < i; j += 2)                                  \
2054       *charbuf++ = components[j];                               \
2055   } while (0)
2056
2057
2058 static void
2059 decode_coding_emacs_mule (coding)
2060      struct coding_system *coding;
2061 {
2062   const unsigned char *src = coding->source + coding->consumed;
2063   const unsigned char *src_end = coding->source + coding->src_bytes;
2064   const unsigned char *src_base;
2065   int *charbuf = coding->charbuf + coding->charbuf_used;
2066   int *charbuf_end
2067     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2068   int consumed_chars = 0, consumed_chars_base;
2069   int multibytep = coding->src_multibyte;
2070   Lisp_Object attrs, charset_list;
2071   int char_offset = coding->produced_char;
2072   int last_offset = char_offset;
2073   int last_id = charset_ascii;
2074
2075   CODING_GET_INFO (coding, attrs, charset_list);
2076
2077   while (1)
2078     {
2079       int c;
2080
2081       src_base = src;
2082       consumed_chars_base = consumed_chars;
2083
2084       if (charbuf >= charbuf_end)
2085         break;
2086
2087       ONE_MORE_BYTE (c);
2088       if (c < 0)
2089         {
2090           *charbuf++ = -c;
2091           char_offset++;
2092         }
2093       else if (c < 0x80)
2094         {
2095           *charbuf++ = c;
2096           char_offset++;
2097         }
2098       else if (c == 0x80)
2099         {
2100           ONE_MORE_BYTE (c);
2101           if (c < 0)
2102             goto invalid_code;
2103           if (c - 0xF2 >= COMPOSITION_RELATIVE
2104               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2105             DECODE_EMACS_MULE_21_COMPOSITION (c);
2106           else if (c < 0xC0)
2107             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2108           else if (c == 0xFF)
2109             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2110           else
2111             goto invalid_code;
2112         }
2113       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2114         {
2115           int nbytes, nchars;
2116           int id;
2117
2118           src = src_base;
2119           consumed_chars = consumed_chars_base;
2120           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2121           if (c < 0)
2122             {
2123               if (c == -2)
2124                 break;
2125               goto invalid_code;
2126             }
2127           if (last_id != id)
2128             {
2129               if (last_id != charset_ascii)
2130                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2131               last_id = id;
2132               last_offset = char_offset;
2133             }
2134           *charbuf++ = c;
2135           src += nbytes;
2136           consumed_chars += nchars;
2137           char_offset++;
2138         }
2139       else
2140         goto invalid_code;
2141       continue;
2142
2143     invalid_code:
2144       src = src_base;
2145       consumed_chars = consumed_chars_base;
2146       ONE_MORE_BYTE (c);
2147       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2148       char_offset++;
2149       coding->errors++;
2150     }
2151
2152  no_more_source:
2153   if (last_id != charset_ascii)
2154     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2155   coding->consumed_char += consumed_chars_base;
2156   coding->consumed = src_base - coding->source;
2157   coding->charbuf_used = charbuf - coding->charbuf;
2158 }
2159
2160
2161 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2162   do {                                          \
2163     if (id < 0xA0)                              \
2164       codes[0] = id, codes[1] = 0;              \
2165     else if (id < 0xE0)                         \
2166       codes[0] = 0x9A, codes[1] = id;           \
2167     else if (id < 0xF0)                         \
2168       codes[0] = 0x9B, codes[1] = id;           \
2169     else if (id < 0xF5)                         \
2170       codes[0] = 0x9C, codes[1] = id;           \
2171     else                                        \
2172       codes[0] = 0x9D, codes[1] = id;           \
2173   } while (0);
2174
2175
2176 static int
2177 encode_coding_emacs_mule (coding)
2178      struct coding_system *coding;
2179 {
2180   int multibytep = coding->dst_multibyte;
2181   int *charbuf = coding->charbuf;
2182   int *charbuf_end = charbuf + coding->charbuf_used;
2183   unsigned char *dst = coding->destination + coding->produced;
2184   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2185   int safe_room = 8;
2186   int produced_chars = 0;
2187   Lisp_Object attrs, charset_list;
2188   int c;
2189   int preferred_charset_id = -1;
2190
2191   CODING_GET_INFO (coding, attrs, charset_list);
2192   if (! EQ (charset_list, Vemacs_mule_charset_list))
2193     {
2194       CODING_ATTR_CHARSET_LIST (attrs)
2195         = charset_list = Vemacs_mule_charset_list;
2196     }
2197
2198   while (charbuf < charbuf_end)
2199     {
2200       ASSURE_DESTINATION (safe_room);
2201       c = *charbuf++;
2202
2203       if (c < 0)
2204         {
2205           /* Handle an annotation.  */
2206           switch (*charbuf)
2207             {
2208             case CODING_ANNOTATE_COMPOSITION_MASK:
2209               /* Not yet implemented.  */
2210               break;
2211             case CODING_ANNOTATE_CHARSET_MASK:
2212               preferred_charset_id = charbuf[3];
2213               if (preferred_charset_id >= 0
2214                   && NILP (Fmemq (make_number (preferred_charset_id),
2215                                   charset_list)))
2216                 preferred_charset_id = -1;
2217               break;
2218             default:
2219               abort ();
2220             }
2221           charbuf += -c - 1;
2222           continue;
2223         }
2224
2225       if (ASCII_CHAR_P (c))
2226         EMIT_ONE_ASCII_BYTE (c);
2227       else if (CHAR_BYTE8_P (c))
2228         {
2229           c = CHAR_TO_BYTE8 (c);
2230           EMIT_ONE_BYTE (c);
2231         }
2232       else
2233         {
2234           struct charset *charset;
2235           unsigned code;
2236           int dimension;
2237           int emacs_mule_id;
2238           unsigned char leading_codes[2];
2239
2240           if (preferred_charset_id >= 0)
2241             {
2242               charset = CHARSET_FROM_ID (preferred_charset_id);
2243               if (! CHAR_CHARSET_P (c, charset))
2244                 charset = char_charset (c, charset_list, NULL);
2245             }
2246           else
2247             charset = char_charset (c, charset_list, &code);
2248           if (! charset)
2249             {
2250               c = coding->default_char;
2251               if (ASCII_CHAR_P (c))
2252                 {
2253                   EMIT_ONE_ASCII_BYTE (c);
2254                   continue;
2255                 }
2256               charset = char_charset (c, charset_list, &code);
2257             }
2258           dimension = CHARSET_DIMENSION (charset);
2259           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2260           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2261           EMIT_ONE_BYTE (leading_codes[0]);
2262           if (leading_codes[1])
2263             EMIT_ONE_BYTE (leading_codes[1]);
2264           if (dimension == 1)
2265             EMIT_ONE_BYTE (code | 0x80);
2266           else
2267             {
2268               code |= 0x8080;
2269               EMIT_ONE_BYTE (code >> 8);
2270               EMIT_ONE_BYTE (code & 0xFF);
2271             }
2272         }
2273     }
2274   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2275   coding->produced_char += produced_chars;
2276   coding->produced = dst - coding->destination;
2277   return 0;
2278 }
2279
2280 \f
2281 /*** 7. ISO2022 handlers ***/
2282
2283 /* The following note describes the coding system ISO2022 briefly.
2284    Since the intention of this note is to help understand the
2285    functions in this file, some parts are NOT ACCURATE or are OVERLY
2286    SIMPLIFIED.  For thorough understanding, please refer to the
2287    original document of ISO2022.  This is equivalent to the standard
2288    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2289
2290    ISO2022 provides many mechanisms to encode several character sets
2291    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2292    is encoded using bytes less than 128.  This may make the encoded
2293    text a little bit longer, but the text passes more easily through
2294    several types of gateway, some of which strip off the MSB (Most
2295    Significant Bit).
2296
2297    There are two kinds of character sets: control character sets and
2298    graphic character sets.  The former contain control characters such
2299    as `newline' and `escape' to provide control functions (control
2300    functions are also provided by escape sequences).  The latter
2301    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2302    two control character sets and many graphic character sets.
2303
2304    Graphic character sets are classified into one of the following
2305    four classes, according to the number of bytes (DIMENSION) and
2306    number of characters in one dimension (CHARS) of the set:
2307    - DIMENSION1_CHARS94
2308    - DIMENSION1_CHARS96
2309    - DIMENSION2_CHARS94
2310    - DIMENSION2_CHARS96
2311
2312    In addition, each character set is assigned an identification tag,
2313    unique for each set, called the "final character" (denoted as <F>
2314    hereafter).  The <F> of each character set is decided by ECMA(*)
2315    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2316    (0x30..0x3F are for private use only).
2317
2318    Note (*): ECMA = European Computer Manufacturers Association
2319
2320    Here are examples of graphic character sets [NAME(<F>)]:
2321         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2322         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2323         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2324         o DIMENSION2_CHARS96 -- none for the moment
2325
2326    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2327         C0 [0x00..0x1F] -- control character plane 0
2328         GL [0x20..0x7F] -- graphic character plane 0
2329         C1 [0x80..0x9F] -- control character plane 1
2330         GR [0xA0..0xFF] -- graphic character plane 1
2331
2332    A control character set is directly designated and invoked to C0 or
2333    C1 by an escape sequence.  The most common case is that:
2334    - ISO646's  control character set is designated/invoked to C0, and
2335    - ISO6429's control character set is designated/invoked to C1,
2336    and usually these designations/invocations are omitted in encoded
2337    text.  In a 7-bit environment, only C0 can be used, and a control
2338    character for C1 is encoded by an appropriate escape sequence to
2339    fit into the environment.  All control characters for C1 are
2340    defined to have corresponding escape sequences.
2341
2342    A graphic character set is at first designated to one of four
2343    graphic registers (G0 through G3), then these graphic registers are
2344    invoked to GL or GR.  These designations and invocations can be
2345    done independently.  The most common case is that G0 is invoked to
2346    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2347    these invocations and designations are omitted in encoded text.
2348    In a 7-bit environment, only GL can be used.
2349
2350    When a graphic character set of CHARS94 is invoked to GL, codes
2351    0x20 and 0x7F of the GL area work as control characters SPACE and
2352    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2353    be used.
2354
2355    There are two ways of invocation: locking-shift and single-shift.
2356    With locking-shift, the invocation lasts until the next different
2357    invocation, whereas with single-shift, the invocation affects the
2358    following character only and doesn't affect the locking-shift
2359    state.  Invocations are done by the following control characters or
2360    escape sequences:
2361
2362    ----------------------------------------------------------------------
2363    abbrev  function                  cntrl escape seq   description
2364    ----------------------------------------------------------------------
2365    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2366    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2367    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2368    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2369    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2370    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2371    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2372    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2373    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2374    ----------------------------------------------------------------------
2375    (*) These are not used by any known coding system.
2376
2377    Control characters for these functions are defined by macros
2378    ISO_CODE_XXX in `coding.h'.
2379
2380    Designations are done by the following escape sequences:
2381    ----------------------------------------------------------------------
2382    escape sequence      description
2383    ----------------------------------------------------------------------
2384    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2385    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2386    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2387    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2388    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2389    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2390    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2391    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2392    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2393    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2394    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2395    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2396    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2397    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2398    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2399    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2400    ----------------------------------------------------------------------
2401
2402    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2403    of dimension 1, chars 94, and final character <F>, etc...
2404
2405    Note (*): Although these designations are not allowed in ISO2022,
2406    Emacs accepts them on decoding, and produces them on encoding
2407    CHARS96 character sets in a coding system which is characterized as
2408    7-bit environment, non-locking-shift, and non-single-shift.
2409
2410    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2411    '(' must be omitted.  We refer to this as "short-form" hereafter.
2412
2413    Now you may notice that there are a lot of ways of encoding the
2414    same multilingual text in ISO2022.  Actually, there exist many
2415    coding systems such as Compound Text (used in X11's inter client
2416    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2417    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2418    localized platforms), and all of these are variants of ISO2022.
2419
2420    In addition to the above, Emacs handles two more kinds of escape
2421    sequences: ISO6429's direction specification and Emacs' private
2422    sequence for specifying character composition.
2423
2424    ISO6429's direction specification takes the following form:
2425         o CSI ']'      -- end of the current direction
2426         o CSI '0' ']'  -- end of the current direction
2427         o CSI '1' ']'  -- start of left-to-right text
2428         o CSI '2' ']'  -- start of right-to-left text
2429    The control character CSI (0x9B: control sequence introducer) is
2430    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2431
2432    Character composition specification takes the following form:
2433         o ESC '0' -- start relative composition
2434         o ESC '1' -- end composition
2435         o ESC '2' -- start rule-base composition (*)
2436         o ESC '3' -- start relative composition with alternate chars  (**)
2437         o ESC '4' -- start rule-base composition with alternate chars  (**)
2438   Since these are not standard escape sequences of any ISO standard,
2439   the use of them with these meanings is restricted to Emacs only.
2440
2441   (*) This form is used only in Emacs 20.7 and older versions,
2442   but newer versions can safely decode it.
2443   (**) This form is used only in Emacs 21.1 and newer versions,
2444   and older versions can't decode it.
2445
2446   Here's a list of example usages of these composition escape
2447   sequences (categorized by `enum composition_method').
2448
2449   COMPOSITION_RELATIVE:
2450         ESC 0 CHAR [ CHAR ] ESC 1
2451   COMPOSITION_WITH_RULE:
2452         ESC 2 CHAR [ RULE CHAR ] ESC 1
2453   COMPOSITION_WITH_ALTCHARS:
2454         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2455   COMPOSITION_WITH_RULE_ALTCHARS:
2456         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2457
2458 enum iso_code_class_type iso_code_class[256];
2459
2460 #define SAFE_CHARSET_P(coding, id)      \
2461   ((id) <= (coding)->max_charset_id     \
2462    && (coding)->safe_charsets[id] >= 0)
2463
2464
2465 #define SHIFT_OUT_OK(category)  \
2466   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2467
2468 static void
2469 setup_iso_safe_charsets (attrs)
2470      Lisp_Object attrs;
2471 {
2472   Lisp_Object charset_list, safe_charsets;
2473   Lisp_Object request;
2474   Lisp_Object reg_usage;
2475   Lisp_Object tail;
2476   int reg94, reg96;
2477   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2478   int max_charset_id;
2479
2480   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2481   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2482       && ! EQ (charset_list, Viso_2022_charset_list))
2483     {
2484       CODING_ATTR_CHARSET_LIST (attrs)
2485         = charset_list = Viso_2022_charset_list;
2486       ASET (attrs, coding_attr_safe_charsets, Qnil);
2487     }
2488
2489   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2490     return;
2491
2492   max_charset_id = 0;
2493   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2494     {
2495       int id = XINT (XCAR (tail));
2496       if (max_charset_id < id)
2497         max_charset_id = id;
2498     }
2499
2500   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2501                                 make_number (255));
2502   request = AREF (attrs, coding_attr_iso_request);
2503   reg_usage = AREF (attrs, coding_attr_iso_usage);
2504   reg94 = XINT (XCAR (reg_usage));
2505   reg96 = XINT (XCDR (reg_usage));
2506
2507   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2508     {
2509       Lisp_Object id;
2510       Lisp_Object reg;
2511       struct charset *charset;
2512
2513       id = XCAR (tail);
2514       charset = CHARSET_FROM_ID (XINT (id));
2515       reg = Fcdr (Fassq (id, request));
2516       if (! NILP (reg))
2517         SSET (safe_charsets, XINT (id), XINT (reg));
2518       else if (charset->iso_chars_96)
2519         {
2520           if (reg96 < 4)
2521             SSET (safe_charsets, XINT (id), reg96);
2522         }
2523       else
2524         {
2525           if (reg94 < 4)
2526             SSET (safe_charsets, XINT (id), reg94);
2527         }
2528     }
2529   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2530 }
2531
2532
2533 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2534    Check if a text is encoded in one of ISO-2022 based codig systems.
2535    If it is, return 1, else return 0.  */
2536
2537 static int
2538 detect_coding_iso_2022 (coding, detect_info)
2539      struct coding_system *coding;
2540      struct coding_detection_info *detect_info;
2541 {
2542   const unsigned char *src = coding->source, *src_base = src;
2543   const unsigned char *src_end = coding->source + coding->src_bytes;
2544   int multibytep = coding->src_multibyte;
2545   int single_shifting = 0;
2546   int id;
2547   int c, c1;
2548   int consumed_chars = 0;
2549   int i;
2550   int rejected = 0;
2551   int found = 0;
2552
2553   detect_info->checked |= CATEGORY_MASK_ISO;
2554
2555   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2556     {
2557       struct coding_system *this = &(coding_categories[i]);
2558       Lisp_Object attrs, val;
2559
2560       attrs = CODING_ID_ATTRS (this->id);
2561       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2562           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2563         setup_iso_safe_charsets (attrs);
2564       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2565       this->max_charset_id = SCHARS (val) - 1;
2566       this->safe_charsets = (char *) SDATA (val);
2567     }
2568
2569   /* A coding system of this category is always ASCII compatible.  */
2570   src += coding->head_ascii;
2571
2572   while (rejected != CATEGORY_MASK_ISO)
2573     {
2574       src_base = src;
2575       ONE_MORE_BYTE (c);
2576       switch (c)
2577         {
2578         case ISO_CODE_ESC:
2579           if (inhibit_iso_escape_detection)
2580             break;
2581           single_shifting = 0;
2582           ONE_MORE_BYTE (c);
2583           if (c >= '(' && c <= '/')
2584             {
2585               /* Designation sequence for a charset of dimension 1.  */
2586               ONE_MORE_BYTE (c1);
2587               if (c1 < ' ' || c1 >= 0x80
2588                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2589                 /* Invalid designation sequence.  Just ignore.  */
2590                 break;
2591             }
2592           else if (c == '$')
2593             {
2594               /* Designation sequence for a charset of dimension 2.  */
2595               ONE_MORE_BYTE (c);
2596               if (c >= '@' && c <= 'B')
2597                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2598                 id = iso_charset_table[1][0][c];
2599               else if (c >= '(' && c <= '/')
2600                 {
2601                   ONE_MORE_BYTE (c1);
2602                   if (c1 < ' ' || c1 >= 0x80
2603                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2604                     /* Invalid designation sequence.  Just ignore.  */
2605                     break;
2606                 }
2607               else
2608                 /* Invalid designation sequence.  Just ignore it.  */
2609                 break;
2610             }
2611           else if (c == 'N' || c == 'O')
2612             {
2613               /* ESC <Fe> for SS2 or SS3.  */
2614               single_shifting = 1;
2615               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2616               break;
2617             }
2618           else if (c >= '0' && c <= '4')
2619             {
2620               /* ESC <Fp> for start/end composition.  */
2621               found |= CATEGORY_MASK_ISO;
2622               break;
2623             }
2624           else
2625             {
2626               /* Invalid escape sequence.  Just ignore it.  */
2627               break;
2628             }
2629
2630           /* We found a valid designation sequence for CHARSET.  */
2631           rejected |= CATEGORY_MASK_ISO_8BIT;
2632           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2633                               id))
2634             found |= CATEGORY_MASK_ISO_7;
2635           else
2636             rejected |= CATEGORY_MASK_ISO_7;
2637           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2638                               id))
2639             found |= CATEGORY_MASK_ISO_7_TIGHT;
2640           else
2641             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2642           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2643                               id))
2644             found |= CATEGORY_MASK_ISO_7_ELSE;
2645           else
2646             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2647           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2648                               id))
2649             found |= CATEGORY_MASK_ISO_8_ELSE;
2650           else
2651             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2652           break;
2653
2654         case ISO_CODE_SO:
2655         case ISO_CODE_SI:
2656           /* Locking shift out/in.  */
2657           if (inhibit_iso_escape_detection)
2658             break;
2659           single_shifting = 0;
2660           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2661           found |= CATEGORY_MASK_ISO_ELSE;
2662           break;
2663
2664         case ISO_CODE_CSI:
2665           /* Control sequence introducer.  */
2666           single_shifting = 0;
2667           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2668           found |= CATEGORY_MASK_ISO_8_ELSE;
2669           goto check_extra_latin;
2670
2671         case ISO_CODE_SS2:
2672         case ISO_CODE_SS3:
2673           /* Single shift.   */
2674           if (inhibit_iso_escape_detection)
2675             break;
2676           single_shifting = 0;
2677           rejected |= CATEGORY_MASK_ISO_7BIT;
2678           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2679               & CODING_ISO_FLAG_SINGLE_SHIFT)
2680             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2681           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2682               & CODING_ISO_FLAG_SINGLE_SHIFT)
2683             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2684           if (single_shifting)
2685             break;
2686           goto check_extra_latin;
2687
2688         default:
2689           if (c < 0)
2690             continue;
2691           if (c < 0x80)
2692             {
2693               single_shifting = 0;
2694               break;
2695             }
2696           if (c >= 0xA0)
2697             {
2698               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2699               found |= CATEGORY_MASK_ISO_8_1;
2700               /* Check the length of succeeding codes of the range
2701                  0xA0..0FF.  If the byte length is even, we include
2702                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2703                  only when we are not single shifting.  */
2704               if (! single_shifting
2705                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2706                 {
2707                   int i = 1;
2708                   while (src < src_end)
2709                     {
2710                       ONE_MORE_BYTE (c);
2711                       if (c < 0xA0)
2712                         break;
2713                       i++;
2714                     }
2715
2716                   if (i & 1 && src < src_end)
2717                     rejected |= CATEGORY_MASK_ISO_8_2;
2718                   else
2719                     found |= CATEGORY_MASK_ISO_8_2;
2720                 }
2721               break;
2722             }
2723         check_extra_latin:
2724           single_shifting = 0;
2725           if (! VECTORP (Vlatin_extra_code_table)
2726               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2727             {
2728               rejected = CATEGORY_MASK_ISO;
2729               break;
2730             }
2731           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2732               & CODING_ISO_FLAG_LATIN_EXTRA)
2733             found |= CATEGORY_MASK_ISO_8_1;
2734           else
2735             rejected |= CATEGORY_MASK_ISO_8_1;
2736           rejected |= CATEGORY_MASK_ISO_8_2;
2737         }
2738     }
2739   detect_info->rejected |= CATEGORY_MASK_ISO;
2740   return 0;
2741
2742  no_more_source:
2743   detect_info->rejected |= rejected;
2744   detect_info->found |= (found & ~rejected);
2745   return 1;
2746 }
2747
2748
2749 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
2750    escape sequence should be kept.  */
2751 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2752   do {                                                                  \
2753     int id, prev;                                                       \
2754                                                                         \
2755     if (final < '0' || final >= 128                                     \
2756         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2757         || !SAFE_CHARSET_P (coding, id))                                \
2758       {                                                                 \
2759         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2760         chars_96 = -1;                                                  \
2761         break;                                                          \
2762       }                                                                 \
2763     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2764     if (id == charset_jisx0201_roman)                                   \
2765       {                                                                 \
2766         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2767           id = charset_ascii;                                           \
2768       }                                                                 \
2769     else if (id == charset_jisx0208_1978)                               \
2770       {                                                                 \
2771         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2772           id = charset_jisx0208;                                        \
2773       }                                                                 \
2774     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2775     /* If there was an invalid designation to REG previously, and this  \
2776        designation is ASCII to REG, we should keep this designation     \
2777        sequence.  */                                                    \
2778     if (prev == -2 && id == charset_ascii)                              \
2779       chars_96 = -1;                                                    \
2780   } while (0)
2781
2782
2783 #define MAYBE_FINISH_COMPOSITION()                              \
2784   do {                                                          \
2785     int i;                                                      \
2786     if (composition_state == COMPOSING_NO)                      \
2787       break;                                                    \
2788     /* It is assured that we have enough room for producing     \
2789        characters stored in the table `components'.  */         \
2790     if (charbuf + component_idx > charbuf_end)                  \
2791       goto no_more_source;                                      \
2792     composition_state = COMPOSING_NO;                           \
2793     if (method == COMPOSITION_RELATIVE                          \
2794         || method == COMPOSITION_WITH_ALTCHARS)                 \
2795       {                                                         \
2796         for (i = 0; i < component_idx; i++)                     \
2797           *charbuf++ = components[i];                           \
2798         char_offset += component_idx;                           \
2799       }                                                         \
2800     else                                                        \
2801       {                                                         \
2802         for (i = 0; i < component_idx; i += 2)                  \
2803           *charbuf++ = components[i];                           \
2804         char_offset += (component_idx / 2) + 1;                 \
2805       }                                                         \
2806   } while (0)
2807
2808
2809 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2810    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2811    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2812    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2813    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2814   */
2815
2816 #define DECODE_COMPOSITION_START(c1)                                    \
2817   do {                                                                  \
2818     if (c1 == '0'                                                       \
2819         && composition_state == COMPOSING_COMPONENT_RULE)               \
2820       {                                                                 \
2821         component_len = component_idx;                                  \
2822         composition_state = COMPOSING_CHAR;                             \
2823       }                                                                 \
2824     else                                                                \
2825       {                                                                 \
2826         const unsigned char *p;                                         \
2827                                                                         \
2828         MAYBE_FINISH_COMPOSITION ();                                    \
2829         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2830           goto no_more_source;                                          \
2831         for (p = src; p < src_end - 1; p++)                             \
2832           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2833             break;                                                      \
2834         if (p == src_end - 1)                                           \
2835           {                                                             \
2836             /* The current composition doesn't end in the current       \
2837                source.  */                                              \
2838             record_conversion_result                                    \
2839               (coding, CODING_RESULT_INSUFFICIENT_SRC);                 \
2840             goto no_more_source;                                        \
2841           }                                                             \
2842                                                                         \
2843         /* This is surely the start of a composition.  */               \
2844         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2845                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2846                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2847                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2848         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2849                              : COMPOSING_COMPONENT_CHAR);               \
2850         component_idx = component_len = 0;                              \
2851       }                                                                 \
2852   } while (0)
2853
2854
2855 /* Handle compositoin end sequence ESC 1.  */
2856
2857 #define DECODE_COMPOSITION_END()                                        \
2858   do {                                                                  \
2859     int nchars = (component_len > 0 ? component_idx - component_len     \
2860                   : method == COMPOSITION_RELATIVE ? component_idx      \
2861                   : (component_idx + 1) / 2);                           \
2862     int i;                                                              \
2863     int *saved_charbuf = charbuf;                                       \
2864                                                                         \
2865     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
2866     if (method != COMPOSITION_RELATIVE)                                 \
2867       {                                                                 \
2868         if (component_len == 0)                                         \
2869           for (i = 0; i < component_idx; i++)                           \
2870             *charbuf++ = components[i];                                 \
2871         else                                                            \
2872           for (i = 0; i < component_len; i++)                           \
2873             *charbuf++ = components[i];                                 \
2874         *saved_charbuf = saved_charbuf - charbuf;                       \
2875       }                                                                 \
2876     if (method == COMPOSITION_WITH_RULE)                                \
2877       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2878         *charbuf++ = components[i];                                     \
2879     else                                                                \
2880       for (i = component_len; i < component_idx; i++, char_offset++)    \
2881         *charbuf++ = components[i];                                     \
2882     coding->annotated = 1;                                              \
2883     composition_state = COMPOSING_NO;                                   \
2884   } while (0)
2885
2886
2887 /* Decode a composition rule from the byte C1 (and maybe one more byte
2888    from SRC) and store one encoded composition rule in
2889    coding->cmp_data.  */
2890
2891 #define DECODE_COMPOSITION_RULE(c1)                                     \
2892   do {                                                                  \
2893     (c1) -= 32;                                                         \
2894     if (c1 < 81)                /* old format (before ver.21) */        \
2895       {                                                                 \
2896         int gref = (c1) / 9;                                            \
2897         int nref = (c1) % 9;                                            \
2898         if (gref == 4) gref = 10;                                       \
2899         if (nref == 4) nref = 10;                                       \
2900         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2901       }                                                                 \
2902     else if (c1 < 93)           /* new format (after ver.21) */         \
2903       {                                                                 \
2904         ONE_MORE_BYTE (c2);                                             \
2905         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2906       }                                                                 \
2907     else                                                                \
2908       c1 = 0;                                                           \
2909   } while (0)
2910
2911
2912 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2913
2914 static void
2915 decode_coding_iso_2022 (coding)
2916      struct coding_system *coding;
2917 {
2918   const unsigned char *src = coding->source + coding->consumed;
2919   const unsigned char *src_end = coding->source + coding->src_bytes;
2920   const unsigned char *src_base;
2921   int *charbuf = coding->charbuf + coding->charbuf_used;
2922   int *charbuf_end
2923     = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2924   int consumed_chars = 0, consumed_chars_base;
2925   int multibytep = coding->src_multibyte;
2926   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2927   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2928   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2929   int charset_id_2, charset_id_3;
2930   struct charset *charset;
2931   int c;
2932   /* For handling composition sequence.  */
2933 #define COMPOSING_NO                    0
2934 #define COMPOSING_CHAR                  1
2935 #define COMPOSING_RULE                  2
2936 #define COMPOSING_COMPONENT_CHAR        3
2937 #define COMPOSING_COMPONENT_RULE        4
2938
2939   int composition_state = COMPOSING_NO;
2940   enum composition_method method;
2941   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2942   int component_idx;
2943   int component_len;
2944   Lisp_Object attrs, charset_list;
2945   int char_offset = coding->produced_char;
2946   int last_offset = char_offset;
2947   int last_id = charset_ascii;
2948
2949   CODING_GET_INFO (coding, attrs, charset_list);
2950   setup_iso_safe_charsets (attrs);
2951
2952   while (1)
2953     {
2954       int c1, c2;
2955
2956       src_base = src;
2957       consumed_chars_base = consumed_chars;
2958
2959       if (charbuf >= charbuf_end)
2960         break;
2961
2962       ONE_MORE_BYTE (c1);
2963       if (c1 < 0)
2964         goto invalid_code;
2965
2966       /* We produce at most one character.  */
2967       switch (iso_code_class [c1])
2968         {
2969         case ISO_0x20_or_0x7F:
2970           if (composition_state != COMPOSING_NO)
2971             {
2972               if (composition_state == COMPOSING_RULE
2973                   || composition_state == COMPOSING_COMPONENT_RULE)
2974                 {
2975                   DECODE_COMPOSITION_RULE (c1);
2976                   components[component_idx++] = c1;
2977                   composition_state--;
2978                   continue;
2979                 }
2980             }
2981           if (charset_id_0 < 0
2982               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2983             /* This is SPACE or DEL.  */
2984             charset = CHARSET_FROM_ID (charset_ascii);
2985           else
2986             charset = CHARSET_FROM_ID (charset_id_0);
2987           break;
2988
2989         case ISO_graphic_plane_0:
2990           if (composition_state != COMPOSING_NO)
2991             {
2992               if (composition_state == COMPOSING_RULE
2993                   || composition_state == COMPOSING_COMPONENT_RULE)
2994                 {
2995                   DECODE_COMPOSITION_RULE (c1);
2996                   components[component_idx++] = c1;
2997                   composition_state--;
2998                   continue;
2999                 }
3000             }
3001           if (charset_id_0 < 0)
3002             charset = CHARSET_FROM_ID (charset_ascii);
3003           else
3004             charset = CHARSET_FROM_ID (charset_id_0);
3005           break;
3006
3007         case ISO_0xA0_or_0xFF:
3008           if (charset_id_1 < 0
3009               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3010               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3011             goto invalid_code;
3012           /* This is a graphic character, we fall down ... */
3013
3014         case ISO_graphic_plane_1:
3015           if (charset_id_1 < 0)
3016             goto invalid_code;
3017           charset = CHARSET_FROM_ID (charset_id_1);
3018           break;
3019
3020         case ISO_control_0:
3021           MAYBE_FINISH_COMPOSITION ();
3022           charset = CHARSET_FROM_ID (charset_ascii);
3023           break;
3024
3025         case ISO_control_1:
3026           MAYBE_FINISH_COMPOSITION ();
3027           goto invalid_code;
3028
3029         case ISO_shift_out:
3030           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3031               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3032             goto invalid_code;
3033           CODING_ISO_INVOCATION (coding, 0) = 1;
3034           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3035           continue;
3036
3037         case ISO_shift_in:
3038           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3039             goto invalid_code;
3040           CODING_ISO_INVOCATION (coding, 0) = 0;
3041           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3042           continue;
3043
3044         case ISO_single_shift_2_7:
3045         case ISO_single_shift_2:
3046           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3047             goto invalid_code;
3048           /* SS2 is handled as an escape sequence of ESC 'N' */
3049           c1 = 'N';
3050           goto label_escape_sequence;
3051
3052         case ISO_single_shift_3:
3053           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3054             goto invalid_code;
3055           /* SS2 is handled as an escape sequence of ESC 'O' */
3056           c1 = 'O';
3057           goto label_escape_sequence;
3058
3059         case ISO_control_sequence_introducer:
3060           /* CSI is handled as an escape sequence of ESC '[' ...  */
3061           c1 = '[';
3062           goto label_escape_sequence;
3063
3064         case ISO_escape:
3065           ONE_MORE_BYTE (c1);
3066         label_escape_sequence:
3067           /* Escape sequences handled here are invocation,
3068              designation, direction specification, and character
3069              composition specification.  */
3070           switch (c1)
3071             {
3072             case '&':           /* revision of following character set */
3073               ONE_MORE_BYTE (c1);
3074               if (!(c1 >= '@' && c1 <= '~'))
3075                 goto invalid_code;
3076               ONE_MORE_BYTE (c1);
3077               if (c1 != ISO_CODE_ESC)
3078                 goto invalid_code;
3079               ONE_MORE_BYTE (c1);
3080               goto label_escape_sequence;
3081
3082             case '$':           /* designation of 2-byte character set */
3083               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3084                 goto invalid_code;
3085               {
3086                 int reg, chars96;
3087
3088                 ONE_MORE_BYTE (c1);
3089                 if (c1 >= '@' && c1 <= 'B')
3090                   {     /* designation of JISX0208.1978, GB2312.1980,
3091                            or JISX0208.1980 */
3092                     reg = 0, chars96 = 0;
3093                   }
3094                 else if (c1 >= 0x28 && c1 <= 0x2B)
3095                   { /* designation of DIMENSION2_CHARS94 character set */
3096                     reg = c1 - 0x28, chars96 = 0;
3097                     ONE_MORE_BYTE (c1);
3098                   }
3099                 else if (c1 >= 0x2C && c1 <= 0x2F)
3100                   { /* designation of DIMENSION2_CHARS96 character set */
3101                     reg = c1 - 0x2C, chars96 = 1;
3102                     ONE_MORE_BYTE (c1);
3103                   }
3104                 else
3105                   goto invalid_code;
3106                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3107                 /* We must update these variables now.  */
3108                 if (reg == 0)
3109                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3110                 else if (reg == 1)
3111                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3112                 if (chars96 < 0)
3113                   goto invalid_code;
3114               }
3115               continue;
3116
3117             case 'n':           /* invocation of locking-shift-2 */
3118               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3119                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3120                 goto invalid_code;
3121               CODING_ISO_INVOCATION (coding, 0) = 2;
3122               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3123               continue;
3124
3125             case 'o':           /* invocation of locking-shift-3 */
3126               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3127                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3128                 goto invalid_code;
3129               CODING_ISO_INVOCATION (coding, 0) = 3;
3130               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3131               continue;
3132
3133             case 'N':           /* invocation of single-shift-2 */
3134               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3135                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3136                 goto invalid_code;
3137               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3138               if (charset_id_2 < 0)
3139                 charset = CHARSET_FROM_ID (charset_ascii);
3140               else
3141                 charset = CHARSET_FROM_ID (charset_id_2);
3142               ONE_MORE_BYTE (c1);
3143               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3144                 goto invalid_code;
3145               break;
3146
3147             case 'O':           /* invocation of single-shift-3 */
3148               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3149                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3150                 goto invalid_code;
3151               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3152               if (charset_id_3 < 0)
3153                 charset = CHARSET_FROM_ID (charset_ascii);
3154               else
3155                 charset = CHARSET_FROM_ID (charset_id_3);
3156               ONE_MORE_BYTE (c1);
3157               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3158                 goto invalid_code;
3159               break;
3160
3161             case '0': case '2': case '3': case '4': /* start composition */
3162               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3163                 goto invalid_code;
3164               DECODE_COMPOSITION_START (c1);
3165               continue;
3166
3167             case '1':           /* end composition */
3168               if (composition_state == COMPOSING_NO)
3169                 goto invalid_code;
3170               DECODE_COMPOSITION_END ();
3171               continue;
3172
3173             case '[':           /* specification of direction */
3174               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3175                 goto invalid_code;
3176               /* For the moment, nested direction is not supported.
3177                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3178                  left-to-right, and nozero means right-to-left.  */
3179               ONE_MORE_BYTE (c1);
3180               switch (c1)
3181                 {
3182                 case ']':       /* end of the current direction */
3183                   coding->mode &= ~CODING_MODE_DIRECTION;
3184
3185                 case '0':       /* end of the current direction */
3186                 case '1':       /* start of left-to-right direction */
3187                   ONE_MORE_BYTE (c1);
3188                   if (c1 == ']')
3189                     coding->mode &= ~CODING_MODE_DIRECTION;
3190                   else
3191                     goto invalid_code;
3192                   break;
3193
3194                 case '2':       /* start of right-to-left direction */
3195                   ONE_MORE_BYTE (c1);
3196                   if (c1 == ']')
3197                     coding->mode |= CODING_MODE_DIRECTION;
3198                   else
3199                     goto invalid_code;
3200                   break;
3201
3202                 default:
3203                   goto invalid_code;
3204                 }
3205               continue;
3206
3207             case '%':
3208               ONE_MORE_BYTE (c1);
3209               if (c1 == '/')
3210                 {
3211                   /* CTEXT extended segment:
3212                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3213                      We keep these bytes as is for the moment.
3214                      They may be decoded by post-read-conversion.  */
3215                   int dim, M, L;
3216                   int size;
3217
3218                   ONE_MORE_BYTE (dim);
3219                   ONE_MORE_BYTE (M);
3220                   ONE_MORE_BYTE (L);
3221                   size = ((M - 128) * 128) + (L - 128);
3222                   if (charbuf + 8 + size > charbuf_end)
3223                     goto break_loop;
3224                   *charbuf++ = ISO_CODE_ESC;
3225                   *charbuf++ = '%';
3226                   *charbuf++ = '/';
3227                   *charbuf++ = dim;
3228                   *charbuf++ = BYTE8_TO_CHAR (M);
3229                   *charbuf++ = BYTE8_TO_CHAR (L);
3230                   while (size-- > 0)
3231                     {
3232                       ONE_MORE_BYTE (c1);
3233                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3234                     }
3235                 }
3236               else if (c1 == 'G')
3237                 {
3238                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3239                      ESC % G --UTF-8-BYTES-- ESC % @
3240                      We keep these bytes as is for the moment.
3241                      They may be decoded by post-read-conversion.  */
3242                   int *p = charbuf;
3243
3244                   if (p + 6 > charbuf_end)
3245                     goto break_loop;
3246                   *p++ = ISO_CODE_ESC;
3247                   *p++ = '%';
3248                   *p++ = 'G';
3249                   while (p < charbuf_end)
3250                     {
3251                       ONE_MORE_BYTE (c1);
3252                       if (c1 == ISO_CODE_ESC
3253                           && src + 1 < src_end
3254                           && src[0] == '%'
3255                           && src[1] == '@')
3256                         {
3257                           src += 2;
3258                           break;
3259                         }
3260                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3261                     }
3262                   if (p + 3 > charbuf_end)
3263                     goto break_loop;
3264                   *p++ = ISO_CODE_ESC;
3265                   *p++ = '%';
3266                   *p++ = '@';
3267                   charbuf = p;
3268                 }
3269               else
3270                 goto invalid_code;
3271               continue;
3272               break;
3273
3274             default:
3275               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3276                 goto invalid_code;
3277               {
3278                 int reg, chars96;
3279
3280                 if (c1 >= 0x28 && c1 <= 0x2B)
3281                   { /* designation of DIMENSION1_CHARS94 character set */
3282                     reg = c1 - 0x28, chars96 = 0;
3283                     ONE_MORE_BYTE (c1);
3284                   }
3285                 else if (c1 >= 0x2C && c1 <= 0x2F)
3286                   { /* designation of DIMENSION1_CHARS96 character set */
3287                     reg = c1 - 0x2C, chars96 = 1;
3288                     ONE_MORE_BYTE (c1);
3289                   }
3290                 else
3291                   goto invalid_code;
3292                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3293                 /* We must update these variables now.  */
3294                 if (reg == 0)
3295                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3296                 else if (reg == 1)
3297                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3298                 if (chars96 < 0)
3299                   goto invalid_code;
3300               }
3301               continue;
3302             }
3303         }
3304
3305       if (charset->id != charset_ascii
3306           && last_id != charset->id)
3307         {
3308           if (last_id != charset_ascii)
3309             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3310           last_id = charset->id;
3311           last_offset = char_offset;
3312         }
3313
3314       /* Now we know CHARSET and 1st position code C1 of a character.
3315          Produce a decoded character while getting 2nd position code
3316          C2 if necessary.  */
3317       c1 &= 0x7F;
3318       if (CHARSET_DIMENSION (charset) > 1)
3319         {
3320           ONE_MORE_BYTE (c2);
3321           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3322             /* C2 is not in a valid range.  */
3323             goto invalid_code;
3324           c1 = (c1 << 8) | (c2 & 0x7F);
3325           if (CHARSET_DIMENSION (charset) > 2)
3326             {
3327               ONE_MORE_BYTE (c2);
3328               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3329                 /* C2 is not in a valid range.  */
3330                 goto invalid_code;
3331               c1 = (c1 << 8) | (c2 & 0x7F);
3332             }
3333         }
3334
3335       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3336       if (c < 0)
3337         {
3338           MAYBE_FINISH_COMPOSITION ();
3339           for (; src_base < src; src_base++, char_offset++)
3340             {
3341               if (ASCII_BYTE_P (*src_base))
3342                 *charbuf++ = *src_base;
3343               else
3344                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3345             }
3346         }
3347       else if (composition_state == COMPOSING_NO)
3348         {
3349           *charbuf++ = c;
3350           char_offset++;
3351         }
3352       else
3353         {
3354           components[component_idx++] = c;
3355           if (method == COMPOSITION_WITH_RULE
3356               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3357                   && composition_state == COMPOSING_COMPONENT_CHAR))
3358             composition_state++;
3359         }
3360       continue;
3361
3362     invalid_code:
3363       MAYBE_FINISH_COMPOSITION ();
3364       src = src_base;
3365       consumed_chars = consumed_chars_base;
3366       ONE_MORE_BYTE (c);
3367       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3368       char_offset++;
3369       coding->errors++;
3370       continue;
3371
3372     break_loop:
3373       break;
3374     }
3375
3376  no_more_source:
3377   if (last_id != charset_ascii)
3378     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3379   coding->consumed_char += consumed_chars_base;
3380   coding->consumed = src_base - coding->source;
3381   coding->charbuf_used = charbuf - coding->charbuf;
3382 }
3383
3384
3385 /* ISO2022 encoding stuff.  */
3386
3387 /*
3388    It is not enough to say just "ISO2022" on encoding, we have to
3389    specify more details.  In Emacs, each coding system of ISO2022
3390    variant has the following specifications:
3391         1. Initial designation to G0 thru G3.
3392         2. Allows short-form designation?
3393         3. ASCII should be designated to G0 before control characters?
3394         4. ASCII should be designated to G0 at end of line?
3395         5. 7-bit environment or 8-bit environment?
3396         6. Use locking-shift?
3397         7. Use Single-shift?
3398    And the following two are only for Japanese:
3399         8. Use ASCII in place of JIS0201-1976-Roman?
3400         9. Use JISX0208-1983 in place of JISX0208-1978?
3401    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3402    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3403    details.
3404 */
3405
3406 /* Produce codes (escape sequence) for designating CHARSET to graphic
3407    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3408    '@', 'A', or 'B' and the coding system CODING allows, produce
3409    designation sequence of short-form.  */
3410
3411 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3412   do {                                                                  \
3413     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3414     char *intermediate_char_94 = "()*+";                                \
3415     char *intermediate_char_96 = ",-./";                                \
3416     int revision = -1;                                                  \
3417     int c;                                                              \
3418                                                                         \
3419     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3420       revision = CHARSET_ISO_REVISION (charset);                        \
3421                                                                         \
3422     if (revision >= 0)                                                  \
3423       {                                                                 \
3424         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3425         EMIT_ONE_BYTE ('@' + revision);                                 \
3426       }                                                                 \
3427     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3428     if (CHARSET_DIMENSION (charset) == 1)                               \
3429       {                                                                 \
3430         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3431           c = intermediate_char_94[reg];                                \
3432         else                                                            \
3433           c = intermediate_char_96[reg];                                \
3434         EMIT_ONE_ASCII_BYTE (c);                                        \
3435       }                                                                 \
3436     else                                                                \
3437       {                                                                 \
3438         EMIT_ONE_ASCII_BYTE ('$');                                      \
3439         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3440           {                                                             \
3441             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3442                 || reg != 0                                             \
3443                 || final_char < '@' || final_char > 'B')                \
3444               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3445           }                                                             \
3446         else                                                            \
3447           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3448       }                                                                 \
3449     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3450                                                                         \
3451     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3452   } while (0)
3453
3454
3455 /* The following two macros produce codes (control character or escape
3456    sequence) for ISO2022 single-shift functions (single-shift-2 and
3457    single-shift-3).  */
3458
3459 #define ENCODE_SINGLE_SHIFT_2                                           \
3460   do {                                                                  \
3461     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3462       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3463     else                                                                \
3464       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3465     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3466   } while (0)
3467
3468
3469 #define ENCODE_SINGLE_SHIFT_3                                           \
3470   do {                                                                  \
3471     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3472       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
3473     else                                                                \
3474       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
3475     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3476   } while (0)
3477
3478
3479 /* The following four macros produce codes (control character or
3480    escape sequence) for ISO2022 locking-shift functions (shift-in,
3481    shift-out, locking-shift-2, and locking-shift-3).  */
3482
3483 #define ENCODE_SHIFT_IN                                 \
3484   do {                                                  \
3485     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
3486     CODING_ISO_INVOCATION (coding, 0) = 0;              \
3487   } while (0)
3488
3489
3490 #define ENCODE_SHIFT_OUT                                \
3491   do {                                                  \
3492     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
3493     CODING_ISO_INVOCATION (coding, 0) = 1;              \
3494   } while (0)
3495
3496
3497 #define ENCODE_LOCKING_SHIFT_2                          \
3498   do {                                                  \
3499     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3500     CODING_ISO_INVOCATION (coding, 0) = 2;              \
3501   } while (0)
3502
3503
3504 #define ENCODE_LOCKING_SHIFT_3                          \
3505   do {                                                  \
3506     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3507     CODING_ISO_INVOCATION (coding, 0) = 3;              \
3508   } while (0)
3509
3510
3511 /* Produce codes for a DIMENSION1 character whose character set is
3512    CHARSET and whose position-code is C1.  Designation and invocation
3513    sequences are also produced in advance if necessary.  */
3514
3515 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
3516   do {                                                                  \
3517     int id = CHARSET_ID (charset);                                      \
3518                                                                         \
3519     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
3520         && id == charset_ascii)                                         \
3521       {                                                                 \
3522         id = charset_jisx0201_roman;                                    \
3523         charset = CHARSET_FROM_ID (id);                                 \
3524       }                                                                 \
3525                                                                         \
3526     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3527       {                                                                 \
3528         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3529           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
3530         else                                                            \
3531           EMIT_ONE_BYTE (c1 | 0x80);                                    \
3532         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3533         break;                                                          \
3534       }                                                                 \
3535     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3536       {                                                                 \
3537         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
3538         break;                                                          \
3539       }                                                                 \
3540     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3541       {                                                                 \
3542         EMIT_ONE_BYTE (c1 | 0x80);                                      \
3543         break;                                                          \
3544       }                                                                 \
3545     else                                                                \
3546       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3547          must invoke it, or, at first, designate it to some graphic     \
3548          register.  Then repeat the loop to actually produce the        \
3549          character.  */                                                 \
3550       dst = encode_invocation_designation (charset, coding, dst,        \
3551                                            &produced_chars);            \
3552   } while (1)
3553
3554
3555 /* Produce codes for a DIMENSION2 character whose character set is
3556    CHARSET and whose position-codes are C1 and C2.  Designation and
3557    invocation codes are also produced in advance if necessary.  */
3558
3559 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
3560   do {                                                                  \
3561     int id = CHARSET_ID (charset);                                      \
3562                                                                         \
3563     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
3564         && id == charset_jisx0208)                                      \
3565       {                                                                 \
3566         id = charset_jisx0208_1978;                                     \
3567         charset = CHARSET_FROM_ID (id);                                 \
3568       }                                                                 \
3569                                                                         \
3570     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3571       {                                                                 \
3572         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3573           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
3574         else                                                            \
3575           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
3576         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3577         break;                                                          \
3578       }                                                                 \
3579     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3580       {                                                                 \
3581         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
3582         break;                                                          \
3583       }                                                                 \
3584     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3585       {                                                                 \
3586         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
3587         break;                                                          \
3588       }                                                                 \
3589     else                                                                \
3590       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3591          must invoke it, or, at first, designate it to some graphic     \
3592          register.  Then repeat the loop to actually produce the        \
3593          character.  */                                                 \
3594       dst = encode_invocation_designation (charset, coding, dst,        \
3595                                            &produced_chars);            \
3596   } while (1)
3597
3598
3599 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
3600   do {                                                                     \
3601     int code = ENCODE_CHAR ((charset),(c));                                \
3602                                                                            \
3603     if (CHARSET_DIMENSION (charset) == 1)                                  \
3604       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
3605     else                                                                   \
3606       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3607   } while (0)
3608
3609
3610 /* Produce designation and invocation codes at a place pointed by DST
3611    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3612    Return new DST.  */
3613
3614 unsigned char *
3615 encode_invocation_designation (charset, coding, dst, p_nchars)
3616      struct charset *charset;
3617      struct coding_system *coding;
3618      unsigned char *dst;
3619      int *p_nchars;
3620 {
3621   int multibytep = coding->dst_multibyte;
3622   int produced_chars = *p_nchars;
3623   int reg;                      /* graphic register number */
3624   int id = CHARSET_ID (charset);
3625
3626   /* At first, check designations.  */
3627   for (reg = 0; reg < 4; reg++)
3628     if (id == CODING_ISO_DESIGNATION (coding, reg))
3629       break;
3630
3631   if (reg >= 4)
3632     {
3633       /* CHARSET is not yet designated to any graphic registers.  */
3634       /* At first check the requested designation.  */
3635       reg = CODING_ISO_REQUEST (coding, id);
3636       if (reg < 0)
3637         /* Since CHARSET requests no special designation, designate it
3638            to graphic register 0.  */
3639         reg = 0;
3640
3641       ENCODE_DESIGNATION (charset, reg, coding);
3642     }
3643
3644   if (CODING_ISO_INVOCATION (coding, 0) != reg
3645       && CODING_ISO_INVOCATION (coding, 1) != reg)
3646     {
3647       /* Since the graphic register REG is not invoked to any graphic
3648          planes, invoke it to graphic plane 0.  */
3649       switch (reg)
3650         {
3651         case 0:                 /* graphic register 0 */
3652           ENCODE_SHIFT_IN;
3653           break;
3654
3655         case 1:                 /* graphic register 1 */
3656           ENCODE_SHIFT_OUT;
3657           break;
3658
3659         case 2:                 /* graphic register 2 */
3660           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3661             ENCODE_SINGLE_SHIFT_2;
3662           else
3663             ENCODE_LOCKING_SHIFT_2;
3664           break;
3665
3666         case 3:                 /* graphic register 3 */
3667           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3668             ENCODE_SINGLE_SHIFT_3;
3669           else
3670             ENCODE_LOCKING_SHIFT_3;
3671           break;
3672         }
3673     }
3674
3675   *p_nchars = produced_chars;
3676   return dst;
3677 }
3678
3679 /* The following three macros produce codes for indicating direction
3680    of text.  */
3681 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
3682   do {                                                                  \
3683     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
3684       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
3685     else                                                                \
3686       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
3687   } while (0)
3688
3689
3690 #define ENCODE_DIRECTION_R2L()                  \
3691   do {                                          \
3692     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3693     EMIT_TWO_ASCII_BYTES ('2', ']');            \
3694   } while (0)
3695
3696
3697 #define ENCODE_DIRECTION_L2R()                  \
3698   do {                                          \
3699     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3700     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3701   } while (0)
3702
3703
3704 /* Produce codes for designation and invocation to reset the graphic
3705    planes and registers to initial state.  */
3706 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3707   do {                                                                  \
3708     int reg;                                                            \
3709     struct charset *charset;                                            \
3710                                                                         \
3711     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3712       ENCODE_SHIFT_IN;                                                  \
3713     for (reg = 0; reg < 4; reg++)                                       \
3714       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3715           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3716               != CODING_ISO_INITIAL (coding, reg)))                     \
3717         {                                                               \
3718           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3719           ENCODE_DESIGNATION (charset, reg, coding);                    \
3720         }                                                               \
3721   } while (0)
3722
3723
3724 /* Produce designation sequences of charsets in the line started from
3725    SRC to a place pointed by DST, and return updated DST.
3726
3727    If the current block ends before any end-of-line, we may fail to
3728    find all the necessary designations.  */
3729
3730 static unsigned char *
3731 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3732      struct coding_system *coding;
3733      int *charbuf, *charbuf_end;
3734      unsigned char *dst;
3735 {
3736   struct charset *charset;
3737   /* Table of charsets to be designated to each graphic register.  */
3738   int r[4];
3739   int c, found = 0, reg;
3740   int produced_chars = 0;
3741   int multibytep = coding->dst_multibyte;
3742   Lisp_Object attrs;
3743   Lisp_Object charset_list;
3744
3745   attrs = CODING_ID_ATTRS (coding->id);
3746   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3747   if (EQ (charset_list, Qiso_2022))
3748     charset_list = Viso_2022_charset_list;
3749
3750   for (reg = 0; reg < 4; reg++)
3751     r[reg] = -1;
3752
3753   while (found < 4)
3754     {
3755       int id;
3756
3757       c = *charbuf++;
3758       if (c == '\n')
3759         break;
3760       charset = char_charset (c, charset_list, NULL);
3761       id = CHARSET_ID (charset);
3762       reg = CODING_ISO_REQUEST (coding, id);
3763       if (reg >= 0 && r[reg] < 0)
3764         {
3765           found++;
3766           r[reg] = id;
3767         }
3768     }
3769
3770   if (found)
3771     {
3772       for (reg = 0; reg < 4; reg++)
3773         if (r[reg] >= 0
3774             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3775           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3776     }
3777
3778   return dst;
3779 }
3780
3781 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3782
3783 static int
3784 encode_coding_iso_2022 (coding)
3785      struct coding_system *coding;
3786 {
3787   int multibytep = coding->dst_multibyte;
3788   int *charbuf = coding->charbuf;
3789   int *charbuf_end = charbuf + coding->charbuf_used;
3790   unsigned char *dst = coding->destination + coding->produced;
3791   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3792   int safe_room = 16;
3793   int bol_designation
3794     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3795        && CODING_ISO_BOL (coding));
3796   int produced_chars = 0;
3797   Lisp_Object attrs, eol_type, charset_list;
3798   int ascii_compatible;
3799   int c;
3800   int preferred_charset_id = -1;
3801
3802   CODING_GET_INFO (coding, attrs, charset_list);
3803   eol_type = CODING_ID_EOL_TYPE (coding->id);
3804   if (VECTORP (eol_type))
3805     eol_type = Qunix;
3806
3807   setup_iso_safe_charsets (attrs);
3808   /* Charset list may have been changed.  */
3809   charset_list = CODING_ATTR_CHARSET_LIST (attrs);              \
3810   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3811
3812   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3813
3814   while (charbuf < charbuf_end)
3815     {
3816       ASSURE_DESTINATION (safe_room);
3817
3818       if (bol_designation)
3819         {
3820           unsigned char *dst_prev = dst;
3821
3822           /* We have to produce designation sequences if any now.  */
3823           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3824           bol_designation = 0;
3825           /* We are sure that designation sequences are all ASCII bytes.  */
3826           produced_chars += dst - dst_prev;
3827         }
3828
3829       c = *charbuf++;
3830
3831       if (c < 0)
3832         {
3833           /* Handle an annotation.  */
3834           switch (*charbuf)
3835             {
3836             case CODING_ANNOTATE_COMPOSITION_MASK:
3837               /* Not yet implemented.  */
3838               break;
3839             case CODING_ANNOTATE_CHARSET_MASK:
3840               preferred_charset_id = charbuf[2];
3841               if (preferred_charset_id >= 0
3842                   && NILP (Fmemq (make_number (preferred_charset_id),
3843                                   charset_list)))
3844                 preferred_charset_id = -1;
3845               break;
3846             default:
3847               abort ();
3848             }
3849           charbuf += -c - 1;
3850           continue;
3851         }
3852
3853       /* Now encode the character C.  */
3854       if (c < 0x20 || c == 0x7F)
3855         {
3856           if (c == '\n'
3857               || (c == '\r' && EQ (eol_type, Qmac)))
3858             {
3859               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3860                 ENCODE_RESET_PLANE_AND_REGISTER ();
3861               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3862                 {
3863                   int i;
3864
3865                   for (i = 0; i < 4; i++)
3866                     CODING_ISO_DESIGNATION (coding, i)
3867                       = CODING_ISO_INITIAL (coding, i);
3868                 }
3869               bol_designation
3870                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3871             }
3872           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3873             ENCODE_RESET_PLANE_AND_REGISTER ();
3874           EMIT_ONE_ASCII_BYTE (c);
3875         }
3876       else if (ASCII_CHAR_P (c))
3877         {
3878           if (ascii_compatible)
3879             EMIT_ONE_ASCII_BYTE (c);
3880           else
3881             {
3882               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3883               ENCODE_ISO_CHARACTER (charset, c);
3884             }
3885         }
3886       else if (CHAR_BYTE8_P (c))
3887         {
3888           c = CHAR_TO_BYTE8 (c);
3889           EMIT_ONE_BYTE (c);
3890         }
3891       else
3892         {
3893           struct charset *charset;
3894
3895           if (preferred_charset_id >= 0)
3896             {
3897               charset = CHARSET_FROM_ID (preferred_charset_id);
3898               if (! CHAR_CHARSET_P (c, charset))
3899                 charset = char_charset (c, charset_list, NULL);
3900             }
3901           else
3902             charset = char_charset (c, charset_list, NULL);
3903           if (!charset)
3904             {
3905               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3906                 {
3907                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3908                   charset = CHARSET_FROM_ID (charset_ascii);
3909                 }
3910               else
3911                 {
3912                   c = coding->default_char;
3913                   charset = char_charset (c, charset_list, NULL);
3914                 }
3915             }
3916           ENCODE_ISO_CHARACTER (charset, c);
3917         }
3918     }
3919
3920   if (coding->mode & CODING_MODE_LAST_BLOCK
3921       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3922     {
3923       ASSURE_DESTINATION (safe_room);
3924       ENCODE_RESET_PLANE_AND_REGISTER ();
3925     }
3926   record_conversion_result (coding, CODING_RESULT_SUCCESS);
3927   CODING_ISO_BOL (coding) = bol_designation;
3928   coding->produced_char += produced_chars;
3929   coding->produced = dst - coding->destination;
3930   return 0;
3931 }
3932
3933 \f
3934 /*** 8,9. SJIS and BIG5 handlers ***/
3935
3936 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3937    quite widely.  So, for the moment, Emacs supports them in the bare
3938    C code.  But, in the future, they may be supported only by CCL.  */
3939
3940 /* SJIS is a coding system encoding three character sets: ASCII, right
3941    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3942    as is.  A character of charset katakana-jisx0201 is encoded by
3943    "position-code + 0x80".  A character of charset japanese-jisx0208
3944    is encoded in 2-byte but two position-codes are divided and shifted
3945    so that it fit in the range below.
3946
3947    --- CODE RANGE of SJIS ---
3948    (character set)      (range)
3949    ASCII                0x00 .. 0x7F
3950    KATAKANA-JISX0201    0xA0 .. 0xDF
3951    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3952             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3953    -------------------------------
3954
3955 */
3956
3957 /* BIG5 is a coding system encoding two character sets: ASCII and
3958    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3959    character set and is encoded in two-byte.
3960
3961    --- CODE RANGE of BIG5 ---
3962    (character set)      (range)
3963    ASCII                0x00 .. 0x7F
3964    Big5 (1st byte)      0xA1 .. 0xFE
3965         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3966    --------------------------
3967
3968   */
3969
3970 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3971    Check if a text is encoded in SJIS.  If it is, return
3972    CATEGORY_MASK_SJIS, else return 0.  */
3973
3974 static int
3975 detect_coding_sjis (coding, detect_info)
3976      struct coding_system *coding;
3977      struct coding_detection_info *detect_info;
3978 {
3979   const unsigned char *src = coding->source, *src_base;
3980   const unsigned char *src_end = coding->source + coding->src_bytes;
3981   int multibytep = coding->src_multibyte;
3982   int consumed_chars = 0;
3983   int found = 0;
3984   int c;
3985
3986   detect_info->checked |= CATEGORY_MASK_SJIS;
3987   /* A coding system of this category is always ASCII compatible.  */
3988   src += coding->head_ascii;
3989
3990   while (1)
3991     {
3992       src_base = src;
3993       ONE_MORE_BYTE (c);
3994       if (c < 0x80)
3995         continue;
3996       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3997         {
3998           ONE_MORE_BYTE (c);
3999           if (c < 0x40 || c == 0x7F || c > 0xFC)
4000             break;
4001           found = CATEGORY_MASK_SJIS;
4002         }
4003       else if (c >= 0xA0 && c < 0xE0)
4004         found = CATEGORY_MASK_SJIS;
4005       else
4006         break;
4007     }
4008   detect_info->rejected |= CATEGORY_MASK_SJIS;
4009   return 0;
4010
4011  no_more_source:
4012   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4013     {
4014       detect_info->rejected |= CATEGORY_MASK_SJIS;
4015       return 0;
4016     }
4017   detect_info->found |= found;
4018   return 1;
4019 }
4020
4021 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4022    Check if a text is encoded in BIG5.  If it is, return
4023    CATEGORY_MASK_BIG5, else return 0.  */
4024
4025 static int
4026 detect_coding_big5 (coding, detect_info)
4027      struct coding_system *coding;
4028      struct coding_detection_info *detect_info;
4029 {
4030   const unsigned char *src = coding->source, *src_base;
4031   const unsigned char *src_end = coding->source + coding->src_bytes;
4032   int multibytep = coding->src_multibyte;
4033   int consumed_chars = 0;
4034   int found = 0;
4035   int c;
4036
4037   detect_info->checked |= CATEGORY_MASK_BIG5;
4038   /* A coding system of this category is always ASCII compatible.  */
4039   src += coding->head_ascii;
4040
4041   while (1)
4042     {
4043       src_base = src;
4044       ONE_MORE_BYTE (c);
4045       if (c < 0x80)
4046         continue;
4047       if (c >= 0xA1)
4048         {
4049           ONE_MORE_BYTE (c);
4050           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4051             return 0;
4052           found = CATEGORY_MASK_BIG5;
4053         }
4054       else
4055         break;
4056     }
4057   detect_info->rejected |= CATEGORY_MASK_BIG5;
4058   return 0;
4059
4060  no_more_source:
4061   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4062     {
4063       detect_info->rejected |= CATEGORY_MASK_BIG5;
4064       return 0;
4065     }
4066   detect_info->found |= found;
4067   return 1;
4068 }
4069
4070 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4071    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4072
4073 static void
4074 decode_coding_sjis (coding)
4075      struct coding_system *coding;
4076 {
4077   const unsigned char *src = coding->source + coding->consumed;
4078   const unsigned char *src_end = coding->source + coding->src_bytes;
4079   const unsigned char *src_base;
4080   int *charbuf = coding->charbuf + coding->charbuf_used;
4081   int *charbuf_end
4082     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4083   int consumed_chars = 0, consumed_chars_base;
4084   int multibytep = coding->src_multibyte;
4085   struct charset *charset_roman, *charset_kanji, *charset_kana;
4086   struct charset *charset_kanji2;
4087   Lisp_Object attrs, charset_list, val;
4088   int char_offset = coding->produced_char;
4089   int last_offset = char_offset;
4090   int last_id = charset_ascii;
4091
4092   CODING_GET_INFO (coding, attrs, charset_list);
4093
4094   val = charset_list;
4095   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4096   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4097   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4098   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4099
4100   while (1)
4101     {
4102       int c, c1;
4103       struct charset *charset;
4104
4105       src_base = src;
4106       consumed_chars_base = consumed_chars;
4107
4108       if (charbuf >= charbuf_end)
4109         break;
4110
4111       ONE_MORE_BYTE (c);
4112       if (c < 0)
4113         goto invalid_code;
4114       if (c < 0x80)
4115         charset = charset_roman;
4116       else if (c == 0x80 || c == 0xA0)
4117         goto invalid_code;
4118       else if (c >= 0xA1 && c <= 0xDF)
4119         {
4120           /* SJIS -> JISX0201-Kana */
4121           c &= 0x7F;
4122           charset = charset_kana;
4123         }
4124       else if (c <= 0xEF)
4125         {
4126           /* SJIS -> JISX0208 */
4127           ONE_MORE_BYTE (c1);
4128           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4129             goto invalid_code;
4130           c = (c << 8) | c1;
4131           SJIS_TO_JIS (c);
4132           charset = charset_kanji;
4133         }
4134       else if (c <= 0xFC && charset_kanji2)
4135         {
4136           /* SJIS -> JISX0213-2 */
4137           ONE_MORE_BYTE (c1);
4138           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4139             goto invalid_code;
4140           c = (c << 8) | c1;
4141           SJIS_TO_JIS2 (c);
4142           charset = charset_kanji2;
4143         }
4144       else
4145         goto invalid_code;
4146       if (charset->id != charset_ascii
4147           && last_id != charset->id)
4148         {
4149           if (last_id != charset_ascii)
4150             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4151           last_id = charset->id;
4152           last_offset = char_offset;
4153         }
4154       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4155       *charbuf++ = c;
4156       char_offset++;
4157       continue;
4158
4159     invalid_code:
4160       src = src_base;
4161       consumed_chars = consumed_chars_base;
4162       ONE_MORE_BYTE (c);
4163       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4164       char_offset++;
4165       coding->errors++;
4166     }
4167
4168  no_more_source:
4169   if (last_id != charset_ascii)
4170     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4171   coding->consumed_char += consumed_chars_base;
4172   coding->consumed = src_base - coding->source;
4173   coding->charbuf_used = charbuf - coding->charbuf;
4174 }
4175
4176 static void
4177 decode_coding_big5 (coding)
4178      struct coding_system *coding;
4179 {
4180   const unsigned char *src = coding->source + coding->consumed;
4181   const unsigned char *src_end = coding->source + coding->src_bytes;
4182   const unsigned char *src_base;
4183   int *charbuf = coding->charbuf + coding->charbuf_used;
4184   int *charbuf_end
4185     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4186   int consumed_chars = 0, consumed_chars_base;
4187   int multibytep = coding->src_multibyte;
4188   struct charset *charset_roman, *charset_big5;
4189   Lisp_Object attrs, charset_list, val;
4190   int char_offset = coding->produced_char;
4191   int last_offset = char_offset;
4192   int last_id = charset_ascii;
4193
4194   CODING_GET_INFO (coding, attrs, charset_list);
4195   val = charset_list;
4196   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4197   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4198
4199   while (1)
4200     {
4201       int c, c1;
4202       struct charset *charset;
4203
4204       src_base = src;
4205       consumed_chars_base = consumed_chars;
4206
4207       if (charbuf >= charbuf_end)
4208         break;
4209
4210       ONE_MORE_BYTE (c);
4211
4212       if (c < 0)
4213         goto invalid_code;
4214       if (c < 0x80)
4215         charset = charset_roman;
4216       else
4217         {
4218           /* BIG5 -> Big5 */
4219           if (c < 0xA1 || c > 0xFE)
4220             goto invalid_code;
4221           ONE_MORE_BYTE (c1);
4222           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4223             goto invalid_code;
4224           c = c << 8 | c1;
4225           charset = charset_big5;
4226         }
4227       if (charset->id != charset_ascii
4228           && last_id != charset->id)
4229         {
4230           if (last_id != charset_ascii)
4231             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4232           last_id = charset->id;
4233           last_offset = char_offset;
4234         }
4235       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4236       *charbuf++ = c;
4237       char_offset++;
4238       continue;
4239
4240     invalid_code:
4241       src = src_base;
4242       consumed_chars = consumed_chars_base;
4243       ONE_MORE_BYTE (c);
4244       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4245       char_offset++;
4246       coding->errors++;
4247     }
4248
4249  no_more_source:
4250   if (last_id != charset_ascii)
4251     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4252   coding->consumed_char += consumed_chars_base;
4253   coding->consumed = src_base - coding->source;
4254   coding->charbuf_used = charbuf - coding->charbuf;
4255 }
4256
4257 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4258    This function can encode charsets `ascii', `katakana-jisx0201',
4259    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4260    are sure that all these charsets are registered as official charset
4261    (i.e. do not have extended leading-codes).  Characters of other
4262    charsets are produced without any encoding.  If SJIS_P is 1, encode
4263    SJIS text, else encode BIG5 text.  */
4264
4265 static int
4266 encode_coding_sjis (coding)
4267      struct coding_system *coding;
4268 {
4269   int multibytep = coding->dst_multibyte;
4270   int *charbuf = coding->charbuf;
4271   int *charbuf_end = charbuf + coding->charbuf_used;
4272   unsigned char *dst = coding->destination + coding->produced;
4273   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4274   int safe_room = 4;
4275   int produced_chars = 0;
4276   Lisp_Object attrs, charset_list, val;
4277   int ascii_compatible;
4278   struct charset *charset_roman, *charset_kanji, *charset_kana;
4279   struct charset *charset_kanji2;
4280   int c;
4281
4282   CODING_GET_INFO (coding, attrs, charset_list);
4283   val = charset_list;
4284   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4285   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4286   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4287   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4288
4289   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4290
4291   while (charbuf < charbuf_end)
4292     {
4293       ASSURE_DESTINATION (safe_room);
4294       c = *charbuf++;
4295       /* Now encode the character C.  */
4296       if (ASCII_CHAR_P (c) && ascii_compatible)
4297         EMIT_ONE_ASCII_BYTE (c);
4298       else if (CHAR_BYTE8_P (c))
4299         {
4300           c = CHAR_TO_BYTE8 (c);
4301           EMIT_ONE_BYTE (c);
4302         }
4303       else
4304         {
4305           unsigned code;
4306           struct charset *charset = char_charset (c, charset_list, &code);
4307
4308           if (!charset)
4309             {
4310               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4311                 {
4312                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4313                   charset = CHARSET_FROM_ID (charset_ascii);
4314                 }
4315               else
4316                 {
4317                   c = coding->default_char;
4318                   charset = char_charset (c, charset_list, &code);
4319                 }
4320             }
4321           if (code == CHARSET_INVALID_CODE (charset))
4322             abort ();
4323           if (charset == charset_kanji)
4324             {
4325               int c1, c2;
4326               JIS_TO_SJIS (code);
4327               c1 = code >> 8, c2 = code & 0xFF;
4328               EMIT_TWO_BYTES (c1, c2);
4329             }
4330           else if (charset == charset_kana)
4331             EMIT_ONE_BYTE (code | 0x80);
4332           else if (charset_kanji2 && charset == charset_kanji2)
4333             {
4334               int c1, c2;
4335
4336               c1 = code >> 8;
4337               if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4338                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4339                 {
4340                   JIS_TO_SJIS2 (code);
4341                   c1 = code >> 8, c2 = code & 0xFF;
4342                   EMIT_TWO_BYTES (c1, c2);
4343                 }
4344               else
4345                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4346             }
4347           else
4348             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4349         }
4350     }
4351   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4352   coding->produced_char += produced_chars;
4353   coding->produced = dst - coding->destination;
4354   return 0;
4355 }
4356
4357 static int
4358 encode_coding_big5 (coding)
4359      struct coding_system *coding;
4360 {
4361   int multibytep = coding->dst_multibyte;
4362   int *charbuf = coding->charbuf;
4363   int *charbuf_end = charbuf + coding->charbuf_used;
4364   unsigned char *dst = coding->destination + coding->produced;
4365   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4366   int safe_room = 4;
4367   int produced_chars = 0;
4368   Lisp_Object attrs, charset_list, val;
4369   int ascii_compatible;
4370   struct charset *charset_roman, *charset_big5;
4371   int c;
4372
4373   CODING_GET_INFO (coding, attrs, charset_list);
4374   val = charset_list;
4375   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4376   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4377   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4378
4379   while (charbuf < charbuf_end)
4380     {
4381       ASSURE_DESTINATION (safe_room);
4382       c = *charbuf++;
4383       /* Now encode the character C.  */
4384       if (ASCII_CHAR_P (c) && ascii_compatible)
4385         EMIT_ONE_ASCII_BYTE (c);
4386       else if (CHAR_BYTE8_P (c))
4387         {
4388           c = CHAR_TO_BYTE8 (c);
4389           EMIT_ONE_BYTE (c);
4390         }
4391       else
4392         {
4393           unsigned code;
4394           struct charset *charset = char_charset (c, charset_list, &code);
4395
4396           if (! charset)
4397             {
4398               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4399                 {
4400                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4401                   charset = CHARSET_FROM_ID (charset_ascii);
4402                 }
4403               else
4404                 {
4405                   c = coding->default_char;
4406                   charset = char_charset (c, charset_list, &code);
4407                 }
4408             }
4409           if (code == CHARSET_INVALID_CODE (charset))
4410             abort ();
4411           if (charset == charset_big5)
4412             {
4413               int c1, c2;
4414
4415               c1 = code >> 8, c2 = code & 0xFF;
4416               EMIT_TWO_BYTES (c1, c2);
4417             }
4418           else
4419             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4420         }
4421     }
4422   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4423   coding->produced_char += produced_chars;
4424   coding->produced = dst - coding->destination;
4425   return 0;
4426 }
4427
4428 \f
4429 /*** 10. CCL handlers ***/
4430
4431 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4432    Check if a text is encoded in a coding system of which
4433    encoder/decoder are written in CCL program.  If it is, return
4434    CATEGORY_MASK_CCL, else return 0.  */
4435
4436 static int
4437 detect_coding_ccl (coding, detect_info)
4438      struct coding_system *coding;
4439      struct coding_detection_info *detect_info;
4440 {
4441   const unsigned char *src = coding->source, *src_base;
4442   const unsigned char *src_end = coding->source + coding->src_bytes;
4443   int multibytep = coding->src_multibyte;
4444   int consumed_chars = 0;
4445   int found = 0;
4446   unsigned char *valids;
4447   int head_ascii = coding->head_ascii;
4448   Lisp_Object attrs;
4449
4450   detect_info->checked |= CATEGORY_MASK_CCL;
4451
4452   coding = &coding_categories[coding_category_ccl];
4453   valids = CODING_CCL_VALIDS (coding);
4454   attrs = CODING_ID_ATTRS (coding->id);
4455   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4456     src += head_ascii;
4457
4458   while (1)
4459     {
4460       int c;
4461
4462       src_base = src;
4463       ONE_MORE_BYTE (c);
4464       if (c < 0 || ! valids[c])
4465         break;
4466       if ((valids[c] > 1))
4467         found = CATEGORY_MASK_CCL;
4468     }
4469   detect_info->rejected |= CATEGORY_MASK_CCL;
4470   return 0;
4471
4472  no_more_source:
4473   detect_info->found |= found;
4474   return 1;
4475 }
4476
4477 static void
4478 decode_coding_ccl (coding)
4479      struct coding_system *coding;
4480 {
4481   const unsigned char *src = coding->source + coding->consumed;
4482   const unsigned char *src_end = coding->source + coding->src_bytes;
4483   int *charbuf = coding->charbuf + coding->charbuf_used;
4484   int *charbuf_end = coding->charbuf + coding->charbuf_size;
4485   int consumed_chars = 0;
4486   int multibytep = coding->src_multibyte;
4487   struct ccl_program ccl;
4488   int source_charbuf[1024];
4489   int source_byteidx[1024];
4490   Lisp_Object attrs, charset_list;
4491
4492   CODING_GET_INFO (coding, attrs, charset_list);
4493   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4494
4495   while (src < src_end)
4496     {
4497       const unsigned char *p = src;
4498       int *source, *source_end;
4499       int i = 0;
4500
4501       if (multibytep)
4502         while (i < 1024 && p < src_end)
4503           {
4504             source_byteidx[i] = p - src;
4505             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4506           }
4507       else
4508         while (i < 1024 && p < src_end)
4509           source_charbuf[i++] = *p++;
4510
4511       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4512         ccl.last_block = 1;
4513
4514       source = source_charbuf;
4515       source_end = source + i;
4516       while (source < source_end)
4517         {
4518           ccl_driver (&ccl, source, charbuf,
4519                       source_end - source, charbuf_end - charbuf,
4520                       charset_list);
4521           source += ccl.consumed;
4522           charbuf += ccl.produced;
4523           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4524             break;
4525         }
4526       if (source < source_end)
4527         src += source_byteidx[source - source_charbuf];
4528       else
4529         src = p;
4530       consumed_chars += source - source_charbuf;
4531
4532       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4533           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4534         break;
4535     }
4536
4537   switch (ccl.status)
4538     {
4539     case CCL_STAT_SUSPEND_BY_SRC:
4540       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4541       break;
4542     case CCL_STAT_SUSPEND_BY_DST:
4543       break;
4544     case CCL_STAT_QUIT:
4545     case CCL_STAT_INVALID_CMD:
4546       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4547       break;
4548     default:
4549       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4550       break;
4551     }
4552   coding->consumed_char += consumed_chars;
4553   coding->consumed = src - coding->source;
4554   coding->charbuf_used = charbuf - coding->charbuf;
4555 }
4556
4557 static int
4558 encode_coding_ccl (coding)
4559      struct coding_system *coding;
4560 {
4561   struct ccl_program ccl;
4562   int multibytep = coding->dst_multibyte;
4563   int *charbuf = coding->charbuf;
4564   int *charbuf_end = charbuf + coding->charbuf_used;
4565   unsigned char *dst = coding->destination + coding->produced;
4566   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4567   int destination_charbuf[1024];
4568   int i, produced_chars = 0;
4569   Lisp_Object attrs, charset_list;
4570
4571   CODING_GET_INFO (coding, attrs, charset_list);
4572   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4573
4574   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4575   ccl.dst_multibyte = coding->dst_multibyte;
4576
4577   while (charbuf < charbuf_end)
4578     {
4579       ccl_driver (&ccl, charbuf, destination_charbuf,
4580                   charbuf_end - charbuf, 1024, charset_list);
4581       if (multibytep)
4582         {
4583           ASSURE_DESTINATION (ccl.produced * 2);
4584           for (i = 0; i < ccl.produced; i++)
4585             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4586         }
4587       else
4588         {
4589           ASSURE_DESTINATION (ccl.produced);
4590           for (i = 0; i < ccl.produced; i++)
4591             *dst++ = destination_charbuf[i] & 0xFF;
4592           produced_chars += ccl.produced;
4593         }
4594       charbuf += ccl.consumed;
4595       if (ccl.status == CCL_STAT_QUIT
4596           || ccl.status == CCL_STAT_INVALID_CMD)
4597         break;
4598     }
4599
4600   switch (ccl.status)
4601     {
4602     case CCL_STAT_SUSPEND_BY_SRC:
4603       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4604       break;
4605     case CCL_STAT_SUSPEND_BY_DST:
4606       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4607       break;
4608     case CCL_STAT_QUIT:
4609     case CCL_STAT_INVALID_CMD:
4610       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4611       break;
4612     default:
4613       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4614       break;
4615     }
4616
4617   coding->produced_char += produced_chars;
4618   coding->produced = dst - coding->destination;
4619   return 0;
4620 }
4621
4622
4623 \f
4624 /*** 10, 11. no-conversion handlers ***/
4625
4626 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4627
4628 static void
4629 decode_coding_raw_text (coding)
4630      struct coding_system *coding;
4631 {
4632   coding->chars_at_source = 1;
4633   coding->consumed_char = 0;
4634   coding->consumed = 0;
4635   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4636 }
4637
4638 static int
4639 encode_coding_raw_text (coding)
4640      struct coding_system *coding;
4641 {
4642   int multibytep = coding->dst_multibyte;
4643   int *charbuf = coding->charbuf;
4644   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4645   unsigned char *dst = coding->destination + coding->produced;
4646   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4647   int produced_chars = 0;
4648   int c;
4649
4650   if (multibytep)
4651     {
4652       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4653
4654       if (coding->src_multibyte)
4655         while (charbuf < charbuf_end)
4656           {
4657             ASSURE_DESTINATION (safe_room);
4658             c = *charbuf++;
4659             if (ASCII_CHAR_P (c))
4660               EMIT_ONE_ASCII_BYTE (c);
4661             else if (CHAR_BYTE8_P (c))
4662               {
4663                 c = CHAR_TO_BYTE8 (c);
4664                 EMIT_ONE_BYTE (c);
4665               }
4666             else
4667               {
4668                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4669
4670                 CHAR_STRING_ADVANCE (c, p1);
4671                 while (p0 < p1)
4672                   {
4673                     EMIT_ONE_BYTE (*p0);
4674                     p0++;
4675                   }
4676               }
4677           }
4678       else
4679         while (charbuf < charbuf_end)
4680           {
4681             ASSURE_DESTINATION (safe_room);
4682             c = *charbuf++;
4683             EMIT_ONE_BYTE (c);
4684           }
4685     }
4686   else
4687     {
4688       if (coding->src_multibyte)
4689         {
4690           int safe_room = MAX_MULTIBYTE_LENGTH;
4691
4692           while (charbuf < charbuf_end)
4693             {
4694               ASSURE_DESTINATION (safe_room);
4695               c = *charbuf++;
4696               if (ASCII_CHAR_P (c))
4697                 *dst++ = c;
4698               else if (CHAR_BYTE8_P (c))
4699                 *dst++ = CHAR_TO_BYTE8 (c);
4700               else
4701                 CHAR_STRING_ADVANCE (c, dst);
4702               produced_chars++;
4703             }
4704         }
4705       else
4706         {
4707           ASSURE_DESTINATION (charbuf_end - charbuf);
4708           while (charbuf < charbuf_end && dst < dst_end)
4709             *dst++ = *charbuf++;
4710           produced_chars = dst - (coding->destination + coding->dst_bytes);
4711         }
4712     }
4713   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4714   coding->produced_char += produced_chars;
4715   coding->produced = dst - coding->destination;
4716   return 0;
4717 }
4718
4719 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4720    Check if a text is encoded in a charset-based coding system.  If it
4721    is, return 1, else return 0.  */
4722
4723 static int
4724 detect_coding_charset (coding, detect_info)
4725      struct coding_system *coding;
4726      struct coding_detection_info *detect_info;
4727 {
4728   const unsigned char *src = coding->source, *src_base;
4729   const unsigned char *src_end = coding->source + coding->src_bytes;
4730   int multibytep = coding->src_multibyte;
4731   int consumed_chars = 0;
4732   Lisp_Object attrs, valids;
4733   int found = 0;
4734
4735   detect_info->checked |= CATEGORY_MASK_CHARSET;
4736
4737   coding = &coding_categories[coding_category_charset];
4738   attrs = CODING_ID_ATTRS (coding->id);
4739   valids = AREF (attrs, coding_attr_charset_valids);
4740
4741   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4742     src += coding->head_ascii;
4743
4744   while (1)
4745     {
4746       int c;
4747
4748       src_base = src;
4749       ONE_MORE_BYTE (c);
4750       if (c < 0)
4751         continue;
4752       if (NILP (AREF (valids, c)))
4753         break;
4754       if (c >= 0x80)
4755         found = CATEGORY_MASK_CHARSET;
4756     }
4757   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4758   return 0;
4759
4760  no_more_source:
4761   detect_info->found |= found;
4762   return 1;
4763 }
4764
4765 static void
4766 decode_coding_charset (coding)
4767      struct coding_system *coding;
4768 {
4769   const unsigned char *src = coding->source + coding->consumed;
4770   const unsigned char *src_end = coding->source + coding->src_bytes;
4771   const unsigned char *src_base;
4772   int *charbuf = coding->charbuf + coding->charbuf_used;
4773   int *charbuf_end
4774     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4775   int consumed_chars = 0, consumed_chars_base;
4776   int multibytep = coding->src_multibyte;
4777   Lisp_Object attrs, charset_list, valids;
4778   int char_offset = coding->produced_char;
4779   int last_offset = char_offset;
4780   int last_id = charset_ascii;
4781
4782   CODING_GET_INFO (coding, attrs, charset_list);
4783   valids = AREF (attrs, coding_attr_charset_valids);
4784
4785   while (1)
4786     {
4787       int c;
4788       Lisp_Object val;
4789       struct charset *charset;
4790       int dim;
4791       int len = 1;
4792       unsigned code;
4793
4794       src_base = src;
4795       consumed_chars_base = consumed_chars;
4796
4797       if (charbuf >= charbuf_end)
4798         break;
4799
4800       ONE_MORE_BYTE (c);
4801       if (c < 0)
4802         goto invalid_code;
4803       code = c;
4804
4805       val = AREF (valids, c);
4806       if (NILP (val))
4807         goto invalid_code;
4808       if (INTEGERP (val))
4809         {
4810           charset = CHARSET_FROM_ID (XFASTINT (val));
4811           dim = CHARSET_DIMENSION (charset);
4812           while (len < dim)
4813             {
4814               ONE_MORE_BYTE (c);
4815               code = (code << 8) | c;
4816               len++;
4817             }
4818           CODING_DECODE_CHAR (coding, src, src_base, src_end,
4819                               charset, code, c);
4820         }
4821       else
4822         {
4823           /* VAL is a list of charset IDs.  It is assured that the
4824              list is sorted by charset dimensions (smaller one
4825              comes first).  */
4826           while (CONSP (val))
4827             {
4828               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4829               dim = CHARSET_DIMENSION (charset);
4830               while (len < dim)
4831                 {
4832                   ONE_MORE_BYTE (c);
4833                   code = (code << 8) | c;
4834                   len++;
4835                 }
4836               CODING_DECODE_CHAR (coding, src, src_base,
4837                                   src_end, charset, code, c);
4838               if (c >= 0)
4839                 break;
4840               val = XCDR (val);
4841             }
4842         }
4843       if (c < 0)
4844         goto invalid_code;
4845       if (charset->id != charset_ascii
4846           && last_id != charset->id)
4847         {
4848           if (last_id != charset_ascii)
4849             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4850           last_id = charset->id;
4851           last_offset = char_offset;
4852         }
4853
4854       *charbuf++ = c;
4855       char_offset++;
4856       continue;
4857
4858     invalid_code:
4859       src = src_base;
4860       consumed_chars = consumed_chars_base;
4861       ONE_MORE_BYTE (c);
4862       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4863       char_offset++;
4864       coding->errors++;
4865     }
4866
4867  no_more_source:
4868   if (last_id != charset_ascii)
4869     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4870   coding->consumed_char += consumed_chars_base;
4871   coding->consumed = src_base - coding->source;
4872   coding->charbuf_used = charbuf - coding->charbuf;
4873 }
4874
4875 static int
4876 encode_coding_charset (coding)
4877      struct coding_system *coding;
4878 {
4879   int multibytep = coding->dst_multibyte;
4880   int *charbuf = coding->charbuf;
4881   int *charbuf_end = charbuf + coding->charbuf_used;
4882   unsigned char *dst = coding->destination + coding->produced;
4883   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4884   int safe_room = MAX_MULTIBYTE_LENGTH;
4885   int produced_chars = 0;
4886   Lisp_Object attrs, charset_list;
4887   int ascii_compatible;
4888   int c;
4889
4890   CODING_GET_INFO (coding, attrs, charset_list);
4891   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4892
4893   while (charbuf < charbuf_end)
4894     {
4895       struct charset *charset;
4896       unsigned code;
4897
4898       ASSURE_DESTINATION (safe_room);
4899       c = *charbuf++;
4900       if (ascii_compatible && ASCII_CHAR_P (c))
4901         EMIT_ONE_ASCII_BYTE (c);
4902       else if (CHAR_BYTE8_P (c))
4903         {
4904           c = CHAR_TO_BYTE8 (c);
4905           EMIT_ONE_BYTE (c);
4906         }
4907       else
4908         {
4909           charset = char_charset (c, charset_list, &code);
4910           if (charset)
4911             {
4912               if (CHARSET_DIMENSION (charset) == 1)
4913                 EMIT_ONE_BYTE (code);
4914               else if (CHARSET_DIMENSION (charset) == 2)
4915                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4916               else if (CHARSET_DIMENSION (charset) == 3)
4917                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4918               else
4919                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4920                                  (code >> 8) & 0xFF, code & 0xFF);
4921             }
4922           else
4923             {
4924               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4925                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4926               else
4927                 c = coding->default_char;
4928               EMIT_ONE_BYTE (c);
4929             }
4930         }
4931     }
4932
4933   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4934   coding->produced_char += produced_chars;
4935   coding->produced = dst - coding->destination;
4936   return 0;
4937 }
4938
4939 \f
4940 /*** 7. C library functions ***/
4941
4942 /* Setup coding context CODING from information about CODING_SYSTEM.
4943    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4944    CODING_SYSTEM is invalid, signal an error.  */
4945
4946 void
4947 setup_coding_system (coding_system, coding)
4948      Lisp_Object coding_system;
4949      struct coding_system *coding;
4950 {
4951   Lisp_Object attrs;
4952   Lisp_Object eol_type;
4953   Lisp_Object coding_type;
4954   Lisp_Object val;
4955
4956   if (NILP (coding_system))
4957     coding_system = Qundecided;
4958
4959   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4960
4961   attrs = CODING_ID_ATTRS (coding->id);
4962   eol_type = CODING_ID_EOL_TYPE (coding->id);
4963
4964   coding->mode = 0;
4965   coding->head_ascii = -1;
4966   coding->common_flags
4967     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4968   if (! NILP (CODING_ATTR_POST_READ (attrs)))
4969     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4970   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4971     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4972   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4973     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4974
4975   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4976   coding->max_charset_id = SCHARS (val) - 1;
4977   coding->safe_charsets = (char *) SDATA (val);
4978   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4979
4980   coding_type = CODING_ATTR_TYPE (attrs);
4981   if (EQ (coding_type, Qundecided))
4982     {
4983       coding->detector = NULL;
4984       coding->decoder = decode_coding_raw_text;
4985       coding->encoder = encode_coding_raw_text;
4986       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4987     }
4988   else if (EQ (coding_type, Qiso_2022))
4989     {
4990       int i;
4991       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4992
4993       /* Invoke graphic register 0 to plane 0.  */
4994       CODING_ISO_INVOCATION (coding, 0) = 0;
4995       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4996       CODING_ISO_INVOCATION (coding, 1)
4997         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4998       /* Setup the initial status of designation.  */
4999       for (i = 0; i < 4; i++)
5000         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
5001       /* Not single shifting initially.  */
5002       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
5003       /* Beginning of buffer should also be regarded as bol. */
5004       CODING_ISO_BOL (coding) = 1;
5005       coding->detector = detect_coding_iso_2022;
5006       coding->decoder = decode_coding_iso_2022;
5007       coding->encoder = encode_coding_iso_2022;
5008       if (flags & CODING_ISO_FLAG_SAFE)
5009         coding->mode |= CODING_MODE_SAFE_ENCODING;
5010       coding->common_flags
5011         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5012             | CODING_REQUIRE_FLUSHING_MASK);
5013       if (flags & CODING_ISO_FLAG_COMPOSITION)
5014         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5015       if (flags & CODING_ISO_FLAG_DESIGNATION)
5016         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5017       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5018         {
5019           setup_iso_safe_charsets (attrs);
5020           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5021           coding->max_charset_id = SCHARS (val) - 1;
5022           coding->safe_charsets = (char *) SDATA (val);
5023         }
5024       CODING_ISO_FLAGS (coding) = flags;
5025     }
5026   else if (EQ (coding_type, Qcharset))
5027     {
5028       coding->detector = detect_coding_charset;
5029       coding->decoder = decode_coding_charset;
5030       coding->encoder = encode_coding_charset;
5031       coding->common_flags
5032         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5033     }
5034   else if (EQ (coding_type, Qutf_8))
5035     {
5036       coding->detector = detect_coding_utf_8;
5037       coding->decoder = decode_coding_utf_8;
5038       coding->encoder = encode_coding_utf_8;
5039       coding->common_flags
5040         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5041     }
5042   else if (EQ (coding_type, Qutf_16))
5043     {
5044       val = AREF (attrs, coding_attr_utf_16_bom);
5045       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5046                                     : EQ (val, Qt) ? utf_16_with_bom
5047                                     : utf_16_without_bom);
5048       val = AREF (attrs, coding_attr_utf_16_endian);
5049       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5050                                        : utf_16_little_endian);
5051       CODING_UTF_16_SURROGATE (coding) = 0;
5052       coding->detector = detect_coding_utf_16;
5053       coding->decoder = decode_coding_utf_16;
5054       coding->encoder = encode_coding_utf_16;
5055       coding->common_flags
5056         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5057       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5058         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5059     }
5060   else if (EQ (coding_type, Qccl))
5061     {
5062       coding->detector = detect_coding_ccl;
5063       coding->decoder = decode_coding_ccl;
5064       coding->encoder = encode_coding_ccl;
5065       coding->common_flags
5066         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5067             | CODING_REQUIRE_FLUSHING_MASK);
5068     }
5069   else if (EQ (coding_type, Qemacs_mule))
5070     {
5071       coding->detector = detect_coding_emacs_mule;
5072       coding->decoder = decode_coding_emacs_mule;
5073       coding->encoder = encode_coding_emacs_mule;
5074       coding->common_flags
5075         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5076       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5077           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5078         {
5079           Lisp_Object tail, safe_charsets;
5080           int max_charset_id = 0;
5081
5082           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5083                tail = XCDR (tail))
5084             if (max_charset_id < XFASTINT (XCAR (tail)))
5085               max_charset_id = XFASTINT (XCAR (tail));
5086           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5087                                         make_number (255));
5088           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5089                tail = XCDR (tail))
5090             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5091           coding->max_charset_id = max_charset_id;
5092           coding->safe_charsets = (char *) SDATA (safe_charsets);
5093         }
5094     }
5095   else if (EQ (coding_type, Qshift_jis))
5096     {
5097       coding->detector = detect_coding_sjis;
5098       coding->decoder = decode_coding_sjis;
5099       coding->encoder = encode_coding_sjis;
5100       coding->common_flags
5101         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5102     }
5103   else if (EQ (coding_type, Qbig5))
5104     {
5105       coding->detector = detect_coding_big5;
5106       coding->decoder = decode_coding_big5;
5107       coding->encoder = encode_coding_big5;
5108       coding->common_flags
5109         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5110     }
5111   else                          /* EQ (coding_type, Qraw_text) */
5112     {
5113       coding->detector = NULL;
5114       coding->decoder = decode_coding_raw_text;
5115       coding->encoder = encode_coding_raw_text;
5116       if (! EQ (eol_type, Qunix))
5117         {
5118           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5119           if (! VECTORP (eol_type))
5120             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5121         }
5122
5123     }
5124
5125   return;
5126 }
5127
5128 /* Return a list of charsets supported by CODING.  */
5129
5130 Lisp_Object
5131 coding_charset_list (coding)
5132      struct coding_system *coding;
5133 {
5134   Lisp_Object attrs, charset_list;
5135
5136   CODING_GET_INFO (coding, attrs, charset_list);
5137   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5138     {
5139       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5140
5141       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5142         charset_list = Viso_2022_charset_list;
5143     }
5144   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5145     {
5146       charset_list = Vemacs_mule_charset_list;
5147     }
5148   return charset_list;
5149 }
5150
5151
5152 /* Return raw-text or one of its subsidiaries that has the same
5153    eol_type as CODING-SYSTEM.  */
5154
5155 Lisp_Object
5156 raw_text_coding_system (coding_system)
5157      Lisp_Object coding_system;
5158 {
5159   Lisp_Object spec, attrs;
5160   Lisp_Object eol_type, raw_text_eol_type;
5161
5162   if (NILP (coding_system))
5163     return Qraw_text;
5164   spec = CODING_SYSTEM_SPEC (coding_system);
5165   attrs = AREF (spec, 0);
5166
5167   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5168     return coding_system;
5169
5170   eol_type = AREF (spec, 2);
5171   if (VECTORP (eol_type))
5172     return Qraw_text;
5173   spec = CODING_SYSTEM_SPEC (Qraw_text);
5174   raw_text_eol_type = AREF (spec, 2);
5175   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5176           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5177           : AREF (raw_text_eol_type, 2));
5178 }
5179
5180
5181 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5182    does, return one of the subsidiary that has the same eol-spec as
5183    PARENT.  Otherwise, return CODING_SYSTEM.  If PARENT is nil,
5184    inherit end-of-line format from the system's setting
5185    (system_eol_type).  */
5186
5187 Lisp_Object
5188 coding_inherit_eol_type (coding_system, parent)
5189      Lisp_Object coding_system, parent;
5190 {
5191   Lisp_Object spec, eol_type;
5192
5193   if (NILP (coding_system))
5194     coding_system = Qraw_text;
5195   spec = CODING_SYSTEM_SPEC (coding_system);
5196   eol_type = AREF (spec, 2);
5197   if (VECTORP (eol_type))
5198     {
5199       Lisp_Object parent_eol_type;
5200
5201       if (! NILP (parent))
5202         {
5203           Lisp_Object parent_spec;
5204
5205           parent_spec
5206             = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5207           parent_eol_type = AREF (parent_spec, 2);
5208         }
5209       else
5210         parent_eol_type = system_eol_type;
5211       if (EQ (parent_eol_type, Qunix))
5212         coding_system = AREF (eol_type, 0);
5213       else if (EQ (parent_eol_type, Qdos))
5214         coding_system = AREF (eol_type, 1);
5215       else if (EQ (parent_eol_type, Qmac))
5216         coding_system = AREF (eol_type, 2);
5217     }
5218   return coding_system;
5219 }
5220
5221 /* Emacs has a mechanism to automatically detect a coding system if it
5222    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5223    it's impossible to distinguish some coding systems accurately
5224    because they use the same range of codes.  So, at first, coding
5225    systems are categorized into 7, those are:
5226
5227    o coding-category-emacs-mule
5228
5229         The category for a coding system which has the same code range
5230         as Emacs' internal format.  Assigned the coding-system (Lisp
5231         symbol) `emacs-mule' by default.
5232
5233    o coding-category-sjis
5234
5235         The category for a coding system which has the same code range
5236         as SJIS.  Assigned the coding-system (Lisp
5237         symbol) `japanese-shift-jis' by default.
5238
5239    o coding-category-iso-7
5240
5241         The category for a coding system which has the same code range
5242         as ISO2022 of 7-bit environment.  This doesn't use any locking
5243         shift and single shift functions.  This can encode/decode all
5244         charsets.  Assigned the coding-system (Lisp symbol)
5245         `iso-2022-7bit' by default.
5246
5247    o coding-category-iso-7-tight
5248
5249         Same as coding-category-iso-7 except that this can
5250         encode/decode only the specified charsets.
5251
5252    o coding-category-iso-8-1
5253
5254         The category for a coding system which has the same code range
5255         as ISO2022 of 8-bit environment and graphic plane 1 used only
5256         for DIMENSION1 charset.  This doesn't use any locking shift
5257         and single shift functions.  Assigned the coding-system (Lisp
5258         symbol) `iso-latin-1' by default.
5259
5260    o coding-category-iso-8-2
5261
5262         The category for a coding system which has the same code range
5263         as ISO2022 of 8-bit environment and graphic plane 1 used only
5264         for DIMENSION2 charset.  This doesn't use any locking shift
5265         and single shift functions.  Assigned the coding-system (Lisp
5266         symbol) `japanese-iso-8bit' by default.
5267
5268    o coding-category-iso-7-else
5269
5270         The category for a coding system which has the same code range
5271         as ISO2022 of 7-bit environemnt but uses locking shift or
5272         single shift functions.  Assigned the coding-system (Lisp
5273         symbol) `iso-2022-7bit-lock' by default.
5274
5275    o coding-category-iso-8-else
5276
5277         The category for a coding system which has the same code range
5278         as ISO2022 of 8-bit environemnt but uses locking shift or
5279         single shift functions.  Assigned the coding-system (Lisp
5280         symbol) `iso-2022-8bit-ss2' by default.
5281
5282    o coding-category-big5
5283
5284         The category for a coding system which has the same code range
5285         as BIG5.  Assigned the coding-system (Lisp symbol)
5286         `cn-big5' by default.
5287
5288    o coding-category-utf-8
5289
5290         The category for a coding system which has the same code range
5291         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
5292         symbol) `utf-8' by default.
5293
5294    o coding-category-utf-16-be
5295
5296         The category for a coding system in which a text has an
5297         Unicode signature (cf. Unicode Standard) in the order of BIG
5298         endian at the head.  Assigned the coding-system (Lisp symbol)
5299         `utf-16-be' by default.
5300
5301    o coding-category-utf-16-le
5302
5303         The category for a coding system in which a text has an
5304         Unicode signature (cf. Unicode Standard) in the order of
5305         LITTLE endian at the head.  Assigned the coding-system (Lisp
5306         symbol) `utf-16-le' by default.
5307
5308    o coding-category-ccl
5309
5310         The category for a coding system of which encoder/decoder is
5311         written in CCL programs.  The default value is nil, i.e., no
5312         coding system is assigned.
5313
5314    o coding-category-binary
5315
5316         The category for a coding system not categorized in any of the
5317         above.  Assigned the coding-system (Lisp symbol)
5318         `no-conversion' by default.
5319
5320    Each of them is a Lisp symbol and the value is an actual
5321    `coding-system's (this is also a Lisp symbol) assigned by a user.
5322    What Emacs does actually is to detect a category of coding system.
5323    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5324    decide only one possible category, it selects a category of the
5325    highest priority.  Priorities of categories are also specified by a
5326    user in a Lisp variable `coding-category-list'.
5327
5328 */
5329
5330 #define EOL_SEEN_NONE   0
5331 #define EOL_SEEN_LF     1
5332 #define EOL_SEEN_CR     2
5333 #define EOL_SEEN_CRLF   4
5334
5335 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5336    SOURCE is encoded.  If CATEGORY is one of
5337    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5338    two-byte, else they are encoded by one-byte.
5339
5340    Return one of EOL_SEEN_XXX.  */
5341
5342 #define MAX_EOL_CHECK_COUNT 3
5343
5344 static int
5345 detect_eol (source, src_bytes, category)
5346      const unsigned char *source;
5347      EMACS_INT src_bytes;
5348      enum coding_category category;
5349 {
5350   const unsigned char *src = source, *src_end = src + src_bytes;
5351   unsigned char c;
5352   int total  = 0;
5353   int eol_seen = EOL_SEEN_NONE;
5354
5355   if ((1 << category) & CATEGORY_MASK_UTF_16)
5356     {
5357       int msb, lsb;
5358
5359       msb = category == (coding_category_utf_16_le
5360                          | coding_category_utf_16_le_nosig);
5361       lsb = 1 - msb;
5362
5363       while (src + 1 < src_end)
5364         {
5365           c = src[lsb];
5366           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5367             {
5368               int this_eol;
5369
5370               if (c == '\n')
5371                 this_eol = EOL_SEEN_LF;
5372               else if (src + 3 >= src_end
5373                        || src[msb + 2] != 0
5374                        || src[lsb + 2] != '\n')
5375                 this_eol = EOL_SEEN_CR;
5376               else
5377                 this_eol = EOL_SEEN_CRLF;
5378
5379               if (eol_seen == EOL_SEEN_NONE)
5380                 /* This is the first end-of-line.  */
5381                 eol_seen = this_eol;
5382               else if (eol_seen != this_eol)
5383                 {
5384                   /* The found type is different from what found before.  */
5385                   eol_seen = EOL_SEEN_LF;
5386                   break;
5387                 }
5388               if (++total == MAX_EOL_CHECK_COUNT)
5389                 break;
5390             }
5391           src += 2;
5392         }
5393     }
5394   else
5395     {
5396       while (src < src_end)
5397         {
5398           c = *src++;
5399           if (c == '\n' || c == '\r')
5400             {
5401               int this_eol;
5402
5403               if (c == '\n')
5404                 this_eol = EOL_SEEN_LF;
5405               else if (src >= src_end || *src != '\n')
5406                 this_eol = EOL_SEEN_CR;
5407               else
5408                 this_eol = EOL_SEEN_CRLF, src++;
5409
5410               if (eol_seen == EOL_SEEN_NONE)
5411                 /* This is the first end-of-line.  */
5412                 eol_seen = this_eol;
5413               else if (eol_seen != this_eol)
5414                 {
5415                   /* The found type is different from what found before.  */
5416                   eol_seen = EOL_SEEN_LF;
5417                   break;
5418                 }
5419               if (++total == MAX_EOL_CHECK_COUNT)
5420                 break;
5421             }
5422         }
5423     }
5424   return eol_seen;
5425 }
5426
5427
5428 static Lisp_Object
5429 adjust_coding_eol_type (coding, eol_seen)
5430      struct coding_system *coding;
5431      int eol_seen;
5432 {
5433   Lisp_Object eol_type;
5434
5435   eol_type = CODING_ID_EOL_TYPE (coding->id);
5436   if (eol_seen & EOL_SEEN_LF)
5437     {
5438       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5439       eol_type = Qunix;
5440     }
5441   else if (eol_seen & EOL_SEEN_CRLF)
5442     {
5443       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5444       eol_type = Qdos;
5445     }
5446   else if (eol_seen & EOL_SEEN_CR)
5447     {
5448       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5449       eol_type = Qmac;
5450     }
5451   return eol_type;
5452 }
5453
5454 /* Detect how a text specified in CODING is encoded.  If a coding
5455    system is detected, update fields of CODING by the detected coding
5456    system.  */
5457
5458 void
5459 detect_coding (coding)
5460      struct coding_system *coding;
5461 {
5462   const unsigned char *src, *src_end;
5463
5464   coding->consumed = coding->consumed_char = 0;
5465   coding->produced = coding->produced_char = 0;
5466   coding_set_source (coding);
5467
5468   src_end = coding->source + coding->src_bytes;
5469
5470   /* If we have not yet decided the text encoding type, detect it
5471      now.  */
5472   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5473     {
5474       int c, i;
5475       struct coding_detection_info detect_info;
5476
5477       detect_info.checked = detect_info.found = detect_info.rejected = 0;
5478       for (i = 0, src = coding->source; src < src_end; i++, src++)
5479         {
5480           c = *src;
5481           if (c & 0x80)
5482             break;
5483           if (c < 0x20
5484               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5485               && ! inhibit_iso_escape_detection
5486               && ! detect_info.checked)
5487             {
5488               coding->head_ascii = src - (coding->source + coding->consumed);
5489               if (detect_coding_iso_2022 (coding, &detect_info))
5490                 {
5491                   /* We have scanned the whole data.  */
5492                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5493                     /* We didn't find an 8-bit code.  */
5494                     src = src_end;
5495                   break;
5496                 }
5497             }
5498         }
5499       coding->head_ascii = src - (coding->source + coding->consumed);
5500
5501       if (coding->head_ascii < coding->src_bytes
5502           || detect_info.found)
5503         {
5504           enum coding_category category;
5505           struct coding_system *this;
5506
5507           if (coding->head_ascii == coding->src_bytes)
5508             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
5509             for (i = 0; i < coding_category_raw_text; i++)
5510               {
5511                 category = coding_priorities[i];
5512                 this = coding_categories + category;
5513                 if (detect_info.found & (1 << category))
5514                   break;
5515               }
5516           else
5517             for (i = 0; i < coding_category_raw_text; i++)
5518               {
5519                 category = coding_priorities[i];
5520                 this = coding_categories + category;
5521                 if (this->id < 0)
5522                   {
5523                     /* No coding system of this category is defined.  */
5524                     detect_info.rejected |= (1 << category);
5525                   }
5526                 else if (category >= coding_category_raw_text)
5527                   continue;
5528                 else if (detect_info.checked & (1 << category))
5529                   {
5530                     if (detect_info.found & (1 << category))
5531                       break;
5532                   }
5533                 else if ((*(this->detector)) (coding, &detect_info)
5534                          && detect_info.found & (1 << category))
5535                   {
5536                     if (category == coding_category_utf_16_auto)
5537                       {
5538                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5539                           category = coding_category_utf_16_le;
5540                         else
5541                           category = coding_category_utf_16_be;
5542                       }
5543                     break;
5544                   }
5545               }
5546
5547           if (i < coding_category_raw_text)
5548             setup_coding_system (CODING_ID_NAME (this->id), coding);
5549           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5550             setup_coding_system (Qraw_text, coding);
5551           else if (detect_info.rejected)
5552             for (i = 0; i < coding_category_raw_text; i++)
5553               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5554                 {
5555                   this = coding_categories + coding_priorities[i];
5556                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5557                   break;
5558                 }
5559         }
5560     }
5561   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5562            == coding_category_utf_16_auto)
5563     {
5564       Lisp_Object coding_systems;
5565       struct coding_detection_info detect_info;
5566
5567       coding_systems
5568         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5569       detect_info.found = detect_info.rejected = 0;
5570       if (CONSP (coding_systems)
5571           && detect_coding_utf_16 (coding, &detect_info))
5572         {
5573           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5574             setup_coding_system (XCAR (coding_systems), coding);
5575           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5576             setup_coding_system (XCDR (coding_systems), coding);
5577         }
5578     }
5579 }
5580
5581
5582 static void
5583 decode_eol (coding)
5584      struct coding_system *coding;
5585 {
5586   Lisp_Object eol_type;
5587   unsigned char *p, *pbeg, *pend;
5588
5589   eol_type = CODING_ID_EOL_TYPE (coding->id);
5590   if (EQ (eol_type, Qunix))
5591     return;
5592
5593   if (NILP (coding->dst_object))
5594     pbeg = coding->destination;
5595   else
5596     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5597   pend = pbeg + coding->produced;
5598
5599   if (VECTORP (eol_type))
5600     {
5601       int eol_seen = EOL_SEEN_NONE;
5602
5603       for (p = pbeg; p < pend; p++)
5604         {
5605           if (*p == '\n')
5606             eol_seen |= EOL_SEEN_LF;
5607           else if (*p == '\r')
5608             {
5609               if (p + 1 < pend && *(p + 1) == '\n')
5610                 {
5611                   eol_seen |= EOL_SEEN_CRLF;
5612                   p++;
5613                 }
5614               else
5615                 eol_seen |= EOL_SEEN_CR;
5616             }
5617         }
5618       if (eol_seen != EOL_SEEN_NONE
5619           && eol_seen != EOL_SEEN_LF
5620           && eol_seen != EOL_SEEN_CRLF
5621           && eol_seen != EOL_SEEN_CR)
5622         eol_seen = EOL_SEEN_LF;
5623       if (eol_seen != EOL_SEEN_NONE)
5624         eol_type = adjust_coding_eol_type (coding, eol_seen);
5625     }
5626
5627   if (EQ (eol_type, Qmac))
5628     {
5629       for (p = pbeg; p < pend; p++)
5630         if (*p == '\r')
5631           *p = '\n';
5632     }
5633   else if (EQ (eol_type, Qdos))
5634     {
5635       int n = 0;
5636
5637       if (NILP (coding->dst_object))
5638         {
5639           /* Start deleting '\r' from the tail to minimize the memory
5640              movement.  */
5641           for (p = pend - 2; p >= pbeg; p--)
5642             if (*p == '\r')
5643               {
5644                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5645                 n++;
5646               }
5647         }
5648       else
5649         {
5650           int pos_byte = coding->dst_pos_byte;
5651           int pos = coding->dst_pos;
5652           int pos_end = pos + coding->produced_char - 1;
5653
5654           while (pos < pos_end)
5655             {
5656               p = BYTE_POS_ADDR (pos_byte);
5657               if (*p == '\r' && p[1] == '\n')
5658                 {
5659                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5660                   n++;
5661                   pos_end--;
5662                 }
5663               pos++;
5664               pos_byte += BYTES_BY_CHAR_HEAD (*p);
5665             }
5666         }
5667       coding->produced -= n;
5668       coding->produced_char -= n;
5669     }
5670 }
5671
5672
5673 /* Return a translation table (or list of them) from coding system
5674    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5675    decoding (ENCODEP is zero). */
5676
5677 static Lisp_Object
5678 get_translation_table (attrs, encodep, max_lookup)
5679      Lisp_Object attrs;
5680      int encodep, *max_lookup;
5681 {
5682   Lisp_Object standard, translation_table;
5683   Lisp_Object val;
5684
5685   if (encodep)
5686     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5687       standard = Vstandard_translation_table_for_encode;
5688   else
5689     translation_table = CODING_ATTR_DECODE_TBL (attrs),
5690       standard = Vstandard_translation_table_for_decode;
5691   if (NILP (translation_table))
5692     translation_table = standard;
5693   else
5694     {
5695       if (SYMBOLP (translation_table))
5696         translation_table = Fget (translation_table, Qtranslation_table);
5697       else if (CONSP (translation_table))
5698         {
5699           translation_table = Fcopy_sequence (translation_table);
5700           for (val = translation_table; CONSP (val); val = XCDR (val))
5701             if (SYMBOLP (XCAR (val)))
5702               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5703         }
5704       if (CHAR_TABLE_P (standard))
5705         {
5706           if (CONSP (translation_table))
5707             translation_table = nconc2 (translation_table,
5708                                         Fcons (standard, Qnil));
5709           else
5710             translation_table = Fcons (translation_table,
5711                                        Fcons (standard, Qnil));
5712         }
5713     }
5714
5715   if (max_lookup)
5716     {
5717       *max_lookup = 1;
5718       if (CHAR_TABLE_P (translation_table)
5719           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5720         {
5721           val = XCHAR_TABLE (translation_table)->extras[1];
5722           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5723             *max_lookup = XFASTINT (val);
5724         }
5725       else if (CONSP (translation_table))
5726         {
5727           Lisp_Object tail, val;
5728
5729           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5730             if (CHAR_TABLE_P (XCAR (tail))
5731                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5732               {
5733                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5734                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5735                   *max_lookup = XFASTINT (val);
5736               }
5737         }
5738     }
5739   return translation_table;
5740 }
5741
5742 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
5743   do {                                                          \
5744     trans = Qnil;                                               \
5745     if (CHAR_TABLE_P (table))                                   \
5746       {                                                         \
5747         trans = CHAR_TABLE_REF (table, c);                      \
5748         if (CHARACTERP (trans))                                 \
5749           c = XFASTINT (trans), trans = Qnil;                   \
5750       }                                                         \
5751     else if (CONSP (table))                                     \
5752       {                                                         \
5753         Lisp_Object tail;                                       \
5754                                                                 \
5755         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
5756           if (CHAR_TABLE_P (XCAR (tail)))                       \
5757             {                                                   \
5758               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
5759               if (CHARACTERP (trans))                           \
5760                 c = XFASTINT (trans), trans = Qnil;             \
5761               else if (! NILP (trans))                          \
5762                 break;                                          \
5763             }                                                   \
5764       }                                                         \
5765   } while (0)
5766
5767
5768 static Lisp_Object
5769 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5770      Lisp_Object val;
5771      int *buf, *buf_end;
5772      int last_block;
5773      int *from_nchars, *to_nchars;
5774 {
5775   /* VAL is TO or (([FROM-CHAR ...] .  TO) ...) where TO is TO-CHAR or
5776      [TO-CHAR ...].  */
5777   if (CONSP (val))
5778     {
5779       Lisp_Object from, tail;
5780       int i, len;
5781
5782       for (tail = val; CONSP (tail); tail = XCDR (tail))
5783         {
5784           val = XCAR (tail);
5785           from = XCAR (val);
5786           len = ASIZE (from);
5787           for (i = 0; i < len; i++)
5788             {
5789               if (buf + i == buf_end)
5790                 {
5791                   if (! last_block)
5792                     return Qt;
5793                   break;
5794                 }
5795               if (XINT (AREF (from, i)) != buf[i])
5796                 break;
5797             }
5798           if (i == len)
5799             {
5800               val = XCDR (val);
5801               *from_nchars = len;
5802               break;
5803             }
5804         }
5805       if (! CONSP (tail))
5806         return Qnil;
5807     }
5808   if (VECTORP (val))
5809     *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5810   else
5811     *buf = XINT (val);
5812   return val;
5813 }
5814
5815
5816 static int
5817 produce_chars (coding, translation_table, last_block)
5818      struct coding_system *coding;
5819      Lisp_Object translation_table;
5820      int last_block;
5821 {
5822   unsigned char *dst = coding->destination + coding->produced;
5823   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5824   int produced;
5825   int produced_chars = 0;
5826   int carryover = 0;
5827
5828   if (! coding->chars_at_source)
5829     {
5830       /* Characters are in coding->charbuf.  */
5831       int *buf = coding->charbuf;
5832       int *buf_end = buf + coding->charbuf_used;
5833
5834       if (BUFFERP (coding->src_object)
5835           && EQ (coding->src_object, coding->dst_object))
5836         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5837
5838       while (buf < buf_end)
5839         {
5840           int c = *buf, i;
5841
5842           if (c >= 0)
5843             {
5844               int from_nchars = 1, to_nchars = 1;
5845               Lisp_Object trans = Qnil;
5846
5847               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5848               if (! NILP (trans))
5849                 {
5850                   trans = get_translation (trans, buf, buf_end, last_block,
5851                                            &from_nchars, &to_nchars);
5852                   if (EQ (trans, Qt))
5853                     break;
5854                   c = *buf;
5855                 }
5856
5857               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5858                 {
5859                   dst = alloc_destination (coding,
5860                                            buf_end - buf
5861                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
5862                                            dst);
5863                   dst_end = coding->destination + coding->dst_bytes;
5864                 }
5865
5866               for (i = 0; i < to_nchars; i++)
5867                 {
5868                   if (i > 0)
5869                     c = XINT (AREF (trans, i));
5870                   if (coding->dst_multibyte
5871                       || ! CHAR_BYTE8_P (c))
5872                     CHAR_STRING_ADVANCE (c, dst);
5873                   else
5874                     *dst++ = CHAR_TO_BYTE8 (c);
5875                 }
5876               produced_chars += to_nchars;
5877               *buf++ = to_nchars;
5878               while (--from_nchars > 0)
5879                 *buf++ = 0;
5880             }
5881           else
5882             /* This is an annotation datum.  (-C) is the length.  */
5883             buf += -c;
5884         }
5885       carryover = buf_end - buf;
5886     }
5887   else
5888     {
5889       const unsigned char *src = coding->source;
5890       const unsigned char *src_end = src + coding->src_bytes;
5891       Lisp_Object eol_type;
5892
5893       eol_type = CODING_ID_EOL_TYPE (coding->id);
5894
5895       if (coding->src_multibyte != coding->dst_multibyte)
5896         {
5897           if (coding->src_multibyte)
5898             {
5899               int multibytep = 1;
5900               int consumed_chars;
5901
5902               while (1)
5903                 {
5904                   const unsigned char *src_base = src;
5905                   int c;
5906
5907                   ONE_MORE_BYTE (c);
5908                   if (c == '\r')
5909                     {
5910                       if (EQ (eol_type, Qdos))
5911                         {
5912                           if (src == src_end)
5913                             {
5914                               record_conversion_result
5915                                 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5916                               goto no_more_source;
5917                             }
5918                           if (*src == '\n')
5919                             c = *src++;
5920                         }
5921                       else if (EQ (eol_type, Qmac))
5922                         c = '\n';
5923                     }
5924                   if (dst == dst_end)
5925                     {
5926                       coding->consumed = src - coding->source;
5927
5928                     if (EQ (coding->src_object, coding->dst_object))
5929                       dst_end = (unsigned char *) src;
5930                     if (dst == dst_end)
5931                       {
5932                         dst = alloc_destination (coding, src_end - src + 1,
5933                                                  dst);
5934                         dst_end = coding->destination + coding->dst_bytes;
5935                         coding_set_source (coding);
5936                         src = coding->source + coding->consumed;
5937                         src_end = coding->source + coding->src_bytes;
5938                       }
5939                     }
5940                   *dst++ = c;
5941                   produced_chars++;
5942                 }
5943             no_more_source:
5944               ;
5945             }
5946           else
5947             while (src < src_end)
5948               {
5949                 int multibytep = 1;
5950                 int c = *src++;
5951
5952                 if (c == '\r')
5953                   {
5954                     if (EQ (eol_type, Qdos))
5955                       {
5956                         if (src < src_end
5957                             && *src == '\n')
5958                           c = *src++;
5959                       }
5960                     else if (EQ (eol_type, Qmac))
5961                       c = '\n';
5962                   }
5963                 if (dst >= dst_end - 1)
5964                   {
5965                     coding->consumed = src - coding->source;
5966
5967                     if (EQ (coding->src_object, coding->dst_object))
5968                       dst_end = (unsigned char *) src;
5969                     if (dst >= dst_end - 1)
5970                       {
5971                         dst = alloc_destination (coding, src_end - src + 2,
5972                                                  dst);
5973                         dst_end = coding->destination + coding->dst_bytes;
5974                         coding_set_source (coding);
5975                         src = coding->source + coding->consumed;
5976                         src_end = coding->source + coding->src_bytes;
5977                       }
5978                   }
5979                 EMIT_ONE_BYTE (c);
5980               }
5981         }
5982       else
5983         {
5984           if (!EQ (coding->src_object, coding->dst_object))
5985             {
5986               int require = coding->src_bytes - coding->dst_bytes;
5987
5988               if (require > 0)
5989                 {
5990                   EMACS_INT offset = src - coding->source;
5991
5992                   dst = alloc_destination (coding, require, dst);
5993                   coding_set_source (coding);
5994                   src = coding->source + offset;
5995                   src_end = coding->source + coding->src_bytes;
5996                 }
5997             }
5998           produced_chars = coding->src_chars;
5999           while (src < src_end)
6000             {
6001               int c = *src++;
6002
6003               if (c == '\r')
6004                 {
6005                   if (EQ (eol_type, Qdos))
6006                     {
6007                       if (src < src_end
6008                           && *src == '\n')
6009                         c = *src++;
6010                       produced_chars--;
6011                     }
6012                   else if (EQ (eol_type, Qmac))
6013                     c = '\n';
6014                 }
6015               *dst++ = c;
6016             }
6017         }
6018       coding->consumed = coding->src_bytes;
6019       coding->consumed_char = coding->src_chars;
6020     }
6021
6022   produced = dst - (coding->destination + coding->produced);
6023   if (BUFFERP (coding->dst_object))
6024     insert_from_gap (produced_chars, produced);
6025   coding->produced += produced;
6026   coding->produced_char += produced_chars;
6027   return carryover;
6028 }
6029
6030 /* Compose text in CODING->object according to the annotation data at
6031    CHARBUF.  CHARBUF is an array:
6032      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
6033  */
6034
6035 static INLINE void
6036 produce_composition (coding, charbuf, pos)
6037      struct coding_system *coding;
6038      int *charbuf;
6039      EMACS_INT pos;
6040 {
6041   int len;
6042   EMACS_INT to;
6043   enum composition_method method;
6044   Lisp_Object components;
6045
6046   len = -charbuf[0];
6047   to = pos + charbuf[2];
6048   if (to <= pos)
6049     return;
6050   method = (enum composition_method) (charbuf[3]);
6051
6052   if (method == COMPOSITION_RELATIVE)
6053     components = Qnil;
6054   else if (method >= COMPOSITION_WITH_RULE
6055            && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6056     {
6057       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6058       int i;
6059
6060       len -= 4;
6061       charbuf += 4;
6062       for (i = 0; i < len; i++)
6063         {
6064           args[i] = make_number (charbuf[i]);
6065           if (charbuf[i] < 0)
6066             return;
6067         }
6068       components = (method == COMPOSITION_WITH_ALTCHARS
6069                     ? Fstring (len, args) : Fvector (len, args));
6070     }
6071   else
6072     return;
6073   compose_text (pos, to, components, Qnil, coding->dst_object);
6074 }
6075
6076
6077 /* Put `charset' property on text in CODING->object according to
6078    the annotation data at CHARBUF.  CHARBUF is an array:
6079      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6080  */
6081
6082 static INLINE void
6083 produce_charset (coding, charbuf, pos)
6084      struct coding_system *coding;
6085      int *charbuf;
6086      EMACS_INT pos;
6087 {
6088   EMACS_INT from = pos - charbuf[2];
6089   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6090
6091   Fput_text_property (make_number (from), make_number (pos),
6092                       Qcharset, CHARSET_NAME (charset),
6093                       coding->dst_object);
6094 }
6095
6096
6097 #define CHARBUF_SIZE 0x4000
6098
6099 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
6100   do {                                                                  \
6101     int size = CHARBUF_SIZE;;                                           \
6102                                                                         \
6103     coding->charbuf = NULL;                                             \
6104     while (size > 1024)                                                 \
6105       {                                                                 \
6106         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
6107         if (coding->charbuf)                                            \
6108           break;                                                        \
6109         size >>= 1;                                                     \
6110       }                                                                 \
6111     if (! coding->charbuf)                                              \
6112       {                                                                 \
6113         record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6114         return coding->result;                                          \
6115       }                                                                 \
6116     coding->charbuf_size = size;                                        \
6117   } while (0)
6118
6119
6120 static void
6121 produce_annotation (coding, pos)
6122      struct coding_system *coding;
6123      EMACS_INT pos;
6124 {
6125   int *charbuf = coding->charbuf;
6126   int *charbuf_end = charbuf + coding->charbuf_used;
6127
6128   if (NILP (coding->dst_object))
6129     return;
6130
6131   while (charbuf < charbuf_end)
6132     {
6133       if (*charbuf >= 0)
6134         pos += *charbuf++;
6135       else
6136         {
6137           int len = -*charbuf;
6138           switch (charbuf[1])
6139             {
6140             case CODING_ANNOTATE_COMPOSITION_MASK:
6141               produce_composition (coding, charbuf, pos);
6142               break;
6143             case CODING_ANNOTATE_CHARSET_MASK:
6144               produce_charset (coding, charbuf, pos);
6145               break;
6146             default:
6147               abort ();
6148             }
6149           charbuf += len;
6150         }
6151     }
6152 }
6153
6154 /* Decode the data at CODING->src_object into CODING->dst_object.
6155    CODING->src_object is a buffer, a string, or nil.
6156    CODING->dst_object is a buffer.
6157
6158    If CODING->src_object is a buffer, it must be the current buffer.
6159    In this case, if CODING->src_pos is positive, it is a position of
6160    the source text in the buffer, otherwise, the source text is in the
6161    gap area of the buffer, and CODING->src_pos specifies the offset of
6162    the text from GPT (which must be the same as PT).  If this is the
6163    same buffer as CODING->dst_object, CODING->src_pos must be
6164    negative.
6165
6166    If CODING->src_object is a string, CODING->src_pos is an index to
6167    that string.
6168
6169    If CODING->src_object is nil, CODING->source must already point to
6170    the non-relocatable memory area.  In this case, CODING->src_pos is
6171    an offset from CODING->source.
6172
6173    The decoded data is inserted at the current point of the buffer
6174    CODING->dst_object.
6175 */
6176
6177 static int
6178 decode_coding (coding)
6179      struct coding_system *coding;
6180 {
6181   Lisp_Object attrs;
6182   Lisp_Object undo_list;
6183   Lisp_Object translation_table;
6184   int carryover;
6185   int i;
6186
6187   if (BUFFERP (coding->src_object)
6188       && coding->src_pos > 0
6189       && coding->src_pos < GPT
6190       && coding->src_pos + coding->src_chars > GPT)
6191     move_gap_both (coding->src_pos, coding->src_pos_byte);
6192
6193   undo_list = Qt;
6194   if (BUFFERP (coding->dst_object))
6195     {
6196       if (current_buffer != XBUFFER (coding->dst_object))
6197         set_buffer_internal (XBUFFER (coding->dst_object));
6198       if (GPT != PT)
6199         move_gap_both (PT, PT_BYTE);
6200       undo_list = current_buffer->undo_list;
6201       current_buffer->undo_list = Qt;
6202     }
6203
6204   coding->consumed = coding->consumed_char = 0;
6205   coding->produced = coding->produced_char = 0;
6206   coding->chars_at_source = 0;
6207   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6208   coding->errors = 0;
6209
6210   ALLOC_CONVERSION_WORK_AREA (coding);
6211
6212   attrs = CODING_ID_ATTRS (coding->id);
6213   translation_table = get_translation_table (attrs, 0, NULL);
6214
6215   carryover = 0;
6216   do
6217     {
6218       EMACS_INT pos = coding->dst_pos + coding->produced_char;
6219
6220       coding_set_source (coding);
6221       coding->annotated = 0;
6222       coding->charbuf_used = carryover;
6223       (*(coding->decoder)) (coding);
6224       coding_set_destination (coding);
6225       carryover = produce_chars (coding, translation_table, 0);
6226       if (coding->annotated)
6227         produce_annotation (coding, pos);
6228       for (i = 0; i < carryover; i++)
6229         coding->charbuf[i]
6230           = coding->charbuf[coding->charbuf_used - carryover + i];
6231     }
6232   while (coding->consumed < coding->src_bytes
6233          && (coding->result == CODING_RESULT_SUCCESS
6234              || coding->result == CODING_RESULT_INVALID_SRC));
6235
6236   if (carryover > 0)
6237     {
6238       coding_set_destination (coding);
6239       coding->charbuf_used = carryover;
6240       produce_chars (coding, translation_table, 1);
6241     }
6242
6243   coding->carryover_bytes = 0;
6244   if (coding->consumed < coding->src_bytes)
6245     {
6246       int nbytes = coding->src_bytes - coding->consumed;
6247       const unsigned char *src;
6248
6249       coding_set_source (coding);
6250       coding_set_destination (coding);
6251       src = coding->source + coding->consumed;
6252
6253       if (coding->mode & CODING_MODE_LAST_BLOCK)
6254         {
6255           /* Flush out unprocessed data as binary chars.  We are sure
6256              that the number of data is less than the size of
6257              coding->charbuf.  */
6258           coding->charbuf_used = 0;
6259           while (nbytes-- > 0)
6260             {
6261               int c = *src++;
6262
6263               if (c & 0x80)
6264                 c = BYTE8_TO_CHAR (c);
6265               coding->charbuf[coding->charbuf_used++] = c;
6266             }
6267           produce_chars (coding, Qnil, 1);
6268         }
6269       else
6270         {
6271           /* Record unprocessed bytes in coding->carryover.  We are
6272              sure that the number of data is less than the size of
6273              coding->carryover.  */
6274           unsigned char *p = coding->carryover;
6275
6276           coding->carryover_bytes = nbytes;
6277           while (nbytes-- > 0)
6278             *p++ = *src++;
6279         }
6280       coding->consumed = coding->src_bytes;
6281     }
6282
6283   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6284     decode_eol (coding);
6285   if (BUFFERP (coding->dst_object))
6286     {
6287       current_buffer->undo_list = undo_list;
6288       record_insert (coding->dst_pos, coding->produced_char);
6289     }
6290   return coding->result;
6291 }
6292
6293
6294 /* Extract an annotation datum from a composition starting at POS and
6295    ending before LIMIT of CODING->src_object (buffer or string), store
6296    the data in BUF, set *STOP to a starting position of the next
6297    composition (if any) or to LIMIT, and return the address of the
6298    next element of BUF.
6299
6300    If such an annotation is not found, set *STOP to a starting
6301    position of a composition after POS (if any) or to LIMIT, and
6302    return BUF.  */
6303
6304 static INLINE int *
6305 handle_composition_annotation (pos, limit, coding, buf, stop)
6306      EMACS_INT pos, limit;
6307      struct coding_system *coding;
6308      int *buf;
6309      EMACS_INT *stop;
6310 {
6311   EMACS_INT start, end;
6312   Lisp_Object prop;
6313
6314   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6315       || end > limit)
6316     *stop = limit;
6317   else if (start > pos)
6318     *stop = start;
6319   else
6320     {
6321       if (start == pos)
6322         {
6323           /* We found a composition.  Store the corresponding
6324              annotation data in BUF.  */
6325           int *head = buf;
6326           enum composition_method method = COMPOSITION_METHOD (prop);
6327           int nchars = COMPOSITION_LENGTH (prop);
6328
6329           ADD_COMPOSITION_DATA (buf, nchars, method);
6330           if (method != COMPOSITION_RELATIVE)
6331             {
6332               Lisp_Object components;
6333               int len, i, i_byte;
6334
6335               components = COMPOSITION_COMPONENTS (prop);
6336               if (VECTORP (components))
6337                 {
6338                   len = XVECTOR (components)->size;
6339                   for (i = 0; i < len; i++)
6340                     *buf++ = XINT (AREF (components, i));
6341                 }
6342               else if (STRINGP (components))
6343                 {
6344                   len = SCHARS (components);
6345                   i = i_byte = 0;
6346                   while (i < len)
6347                     {
6348                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6349                       buf++;
6350                     }
6351                 }
6352               else if (INTEGERP (components))
6353                 {
6354                   len = 1;
6355                   *buf++ = XINT (components);
6356                 }
6357               else if (CONSP (components))
6358                 {
6359                   for (len = 0; CONSP (components);
6360                        len++, components = XCDR (components))
6361                     *buf++ = XINT (XCAR (components));
6362                 }
6363               else
6364                 abort ();
6365               *head -= len;
6366             }
6367         }
6368
6369       if (find_composition (end, limit, &start, &end, &prop,
6370                             coding->src_object)
6371           && end <= limit)
6372         *stop = start;
6373       else
6374         *stop = limit;
6375     }
6376   return buf;
6377 }
6378
6379
6380 /* Extract an annotation datum from a text property `charset' at POS of
6381    CODING->src_object (buffer of string), store the data in BUF, set
6382    *STOP to the position where the value of `charset' property changes
6383    (limiting by LIMIT), and return the address of the next element of
6384    BUF.
6385
6386    If the property value is nil, set *STOP to the position where the
6387    property value is non-nil (limiting by LIMIT), and return BUF.  */
6388
6389 static INLINE int *
6390 handle_charset_annotation (pos, limit, coding, buf, stop)
6391      EMACS_INT pos, limit;
6392      struct coding_system *coding;
6393      int *buf;
6394      EMACS_INT *stop;
6395 {
6396   Lisp_Object val, next;
6397   int id;
6398
6399   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6400   if (! NILP (val) && CHARSETP (val))
6401     id = XINT (CHARSET_SYMBOL_ID (val));
6402   else
6403     id = -1;
6404   ADD_CHARSET_DATA (buf, 0, id);
6405   next = Fnext_single_property_change (make_number (pos), Qcharset,
6406                                        coding->src_object,
6407                                        make_number (limit));
6408   *stop = XINT (next);
6409   return buf;
6410 }
6411
6412
6413 static void
6414 consume_chars (coding, translation_table, max_lookup)
6415      struct coding_system *coding;
6416      Lisp_Object translation_table;
6417      int max_lookup;
6418 {
6419   int *buf = coding->charbuf;
6420   int *buf_end = coding->charbuf + coding->charbuf_size;
6421   const unsigned char *src = coding->source + coding->consumed;
6422   const unsigned char *src_end = coding->source + coding->src_bytes;
6423   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6424   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6425   int multibytep = coding->src_multibyte;
6426   Lisp_Object eol_type;
6427   int c;
6428   EMACS_INT stop, stop_composition, stop_charset;
6429   int *lookup_buf = NULL;
6430
6431   if (! NILP (translation_table))
6432     lookup_buf = alloca (sizeof (int) * max_lookup);
6433
6434   eol_type = CODING_ID_EOL_TYPE (coding->id);
6435   if (VECTORP (eol_type))
6436     eol_type = Qunix;
6437
6438   /* Note: composition handling is not yet implemented.  */
6439   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6440
6441   if (NILP (coding->src_object))
6442     stop = stop_composition = stop_charset = end_pos;
6443   else
6444     {
6445       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6446         stop = stop_composition = pos;
6447       else
6448         stop = stop_composition = end_pos;
6449       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6450         stop = stop_charset = pos;
6451       else
6452         stop_charset = end_pos;
6453     }
6454
6455   /* Compensate for CRLF and conversion.  */
6456   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6457   while (buf < buf_end)
6458     {
6459       Lisp_Object trans;
6460
6461       if (pos == stop)
6462         {
6463           if (pos == end_pos)
6464             break;
6465           if (pos == stop_composition)
6466             buf = handle_composition_annotation (pos, end_pos, coding,
6467                                                  buf, &stop_composition);
6468           if (pos == stop_charset)
6469             buf = handle_charset_annotation (pos, end_pos, coding,
6470                                              buf, &stop_charset);
6471           stop = (stop_composition < stop_charset
6472                   ? stop_composition : stop_charset);
6473         }
6474
6475       if (! multibytep)
6476         {
6477           EMACS_INT bytes;
6478
6479           if (coding->encoder == encode_coding_raw_text)
6480             c = *src++, pos++;
6481           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6482             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6483           else
6484             c = BYTE8_TO_CHAR (*src), src++, pos++;
6485         }
6486       else
6487         c = STRING_CHAR_ADVANCE (src), pos++;
6488       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6489         c = '\n';
6490       if (! EQ (eol_type, Qunix))
6491         {
6492           if (c == '\n')
6493             {
6494               if (EQ (eol_type, Qdos))
6495                 *buf++ = '\r';
6496               else
6497                 c = '\r';
6498             }
6499         }
6500
6501       trans = Qnil;
6502       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6503       if (NILP (trans))
6504         *buf++ = c;
6505       else
6506         {
6507           int from_nchars = 1, to_nchars = 1;
6508           int *lookup_buf_end;
6509           const unsigned char *p = src;
6510           int i;
6511
6512           lookup_buf[0] = c;
6513           for (i = 1; i < max_lookup && p < src_end; i++)
6514             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6515           lookup_buf_end = lookup_buf + i;
6516           trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6517                                    &from_nchars, &to_nchars);
6518           if (EQ (trans, Qt)
6519               || buf + to_nchars > buf_end)
6520             break;
6521           *buf++ = *lookup_buf;
6522           for (i = 1; i < to_nchars; i++)
6523             *buf++ = XINT (AREF (trans, i));
6524           for (i = 1; i < from_nchars; i++, pos++)
6525             src += MULTIBYTE_LENGTH_NO_CHECK (src);
6526         }
6527     }
6528
6529   coding->consumed = src - coding->source;
6530   coding->consumed_char = pos - coding->src_pos;
6531   coding->charbuf_used = buf - coding->charbuf;
6532   coding->chars_at_source = 0;
6533 }
6534
6535
6536 /* Encode the text at CODING->src_object into CODING->dst_object.
6537    CODING->src_object is a buffer or a string.
6538    CODING->dst_object is a buffer or nil.
6539
6540    If CODING->src_object is a buffer, it must be the current buffer.
6541    In this case, if CODING->src_pos is positive, it is a position of
6542    the source text in the buffer, otherwise. the source text is in the
6543    gap area of the buffer, and coding->src_pos specifies the offset of
6544    the text from GPT (which must be the same as PT).  If this is the
6545    same buffer as CODING->dst_object, CODING->src_pos must be
6546    negative and CODING should not have `pre-write-conversion'.
6547
6548    If CODING->src_object is a string, CODING should not have
6549    `pre-write-conversion'.
6550
6551    If CODING->dst_object is a buffer, the encoded data is inserted at
6552    the current point of that buffer.
6553
6554    If CODING->dst_object is nil, the encoded data is placed at the
6555    memory area specified by CODING->destination.  */
6556
6557 static int
6558 encode_coding (coding)
6559      struct coding_system *coding;
6560 {
6561   Lisp_Object attrs;
6562   Lisp_Object translation_table;
6563   int max_lookup;
6564
6565   attrs = CODING_ID_ATTRS (coding->id);
6566   if (coding->encoder == encode_coding_raw_text)
6567     translation_table = Qnil, max_lookup = 0;
6568   else
6569     translation_table = get_translation_table (attrs, 1, &max_lookup);
6570
6571   if (BUFFERP (coding->dst_object))
6572     {
6573       set_buffer_internal (XBUFFER (coding->dst_object));
6574       coding->dst_multibyte
6575         = ! NILP (current_buffer->enable_multibyte_characters);
6576     }
6577
6578   coding->consumed = coding->consumed_char = 0;
6579   coding->produced = coding->produced_char = 0;
6580   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6581   coding->errors = 0;
6582
6583   ALLOC_CONVERSION_WORK_AREA (coding);
6584
6585   do {
6586     coding_set_source (coding);
6587     consume_chars (coding, translation_table, max_lookup);
6588     coding_set_destination (coding);
6589     (*(coding->encoder)) (coding);
6590   } while (coding->consumed_char < coding->src_chars);
6591
6592   if (BUFFERP (coding->dst_object))
6593     insert_from_gap (coding->produced_char, coding->produced);
6594
6595   return (coding->result);
6596 }
6597
6598
6599 /* Name (or base name) of work buffer for code conversion.  */
6600 static Lisp_Object Vcode_conversion_workbuf_name;
6601
6602 /* A working buffer used by the top level conversion.  Once it is
6603    created, it is never destroyed.  It has the name
6604    Vcode_conversion_workbuf_name.  The other working buffers are
6605    destroyed after the use is finished, and their names are modified
6606    versions of Vcode_conversion_workbuf_name.  */
6607 static Lisp_Object Vcode_conversion_reused_workbuf;
6608
6609 /* 1 iff Vcode_conversion_reused_workbuf is already in use.  */
6610 static int reused_workbuf_in_use;
6611
6612
6613 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
6614    multibyteness of returning buffer.  */
6615
6616 static Lisp_Object
6617 make_conversion_work_buffer (multibyte)
6618      int multibyte;
6619 {
6620   Lisp_Object name, workbuf;
6621   struct buffer *current;
6622
6623   if (reused_workbuf_in_use++)
6624     {
6625       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6626       workbuf = Fget_buffer_create (name);
6627     }
6628   else
6629     {
6630       name = Vcode_conversion_workbuf_name;
6631       workbuf = Fget_buffer_create (name);
6632       if (NILP (Vcode_conversion_reused_workbuf))
6633         Vcode_conversion_reused_workbuf = workbuf;
6634     }
6635   current = current_buffer;
6636   set_buffer_internal (XBUFFER (workbuf));
6637   Ferase_buffer ();
6638   current_buffer->undo_list = Qt;
6639   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6640   set_buffer_internal (current);
6641   return workbuf;
6642 }
6643
6644
6645 static Lisp_Object
6646 code_conversion_restore (arg)
6647      Lisp_Object arg;
6648 {
6649   Lisp_Object current, workbuf;
6650   struct gcpro gcpro1;
6651
6652   GCPRO1 (arg);
6653   current = XCAR (arg);
6654   workbuf = XCDR (arg);
6655   if (! NILP (workbuf))
6656     {
6657       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6658         reused_workbuf_in_use = 0;
6659       else if (! NILP (Fbuffer_live_p (workbuf)))
6660         Fkill_buffer (workbuf);
6661     }
6662   set_buffer_internal (XBUFFER (current));
6663   UNGCPRO;
6664   return Qnil;
6665 }
6666
6667 Lisp_Object
6668 code_conversion_save (with_work_buf, multibyte)
6669      int with_work_buf, multibyte;
6670 {
6671   Lisp_Object workbuf = Qnil;
6672
6673   if (with_work_buf)
6674     workbuf = make_conversion_work_buffer (multibyte);
6675   record_unwind_protect (code_conversion_restore,
6676                          Fcons (Fcurrent_buffer (), workbuf));
6677   return workbuf;
6678 }
6679
6680 int
6681 decode_coding_gap (coding, chars, bytes)
6682      struct coding_system *coding;
6683      EMACS_INT chars, bytes;
6684 {
6685   int count = specpdl_ptr - specpdl;
6686   Lisp_Object attrs;
6687
6688   code_conversion_save (0, 0);
6689
6690   coding->src_object = Fcurrent_buffer ();
6691   coding->src_chars = chars;
6692   coding->src_bytes = bytes;
6693   coding->src_pos = -chars;
6694   coding->src_pos_byte = -bytes;
6695   coding->src_multibyte = chars < bytes;
6696   coding->dst_object = coding->src_object;
6697   coding->dst_pos = PT;
6698   coding->dst_pos_byte = PT_BYTE;
6699   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6700
6701   if (CODING_REQUIRE_DETECTION (coding))
6702     detect_coding (coding);
6703
6704   coding->mode |= CODING_MODE_LAST_BLOCK;
6705   decode_coding (coding);
6706
6707   attrs = CODING_ID_ATTRS (coding->id);
6708   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6709     {
6710       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6711       Lisp_Object val;
6712
6713       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6714       val = call1 (CODING_ATTR_POST_READ (attrs),
6715                    make_number (coding->produced_char));
6716       CHECK_NATNUM (val);
6717       coding->produced_char += Z - prev_Z;
6718       coding->produced += Z_BYTE - prev_Z_BYTE;
6719     }
6720
6721   unbind_to (count, Qnil);
6722   return coding->result;
6723 }
6724
6725 int
6726 encode_coding_gap (coding, chars, bytes)
6727      struct coding_system *coding;
6728      EMACS_INT chars, bytes;
6729 {
6730   int count = specpdl_ptr - specpdl;
6731
6732   code_conversion_save (0, 0);
6733
6734   coding->src_object = Fcurrent_buffer ();
6735   coding->src_chars = chars;
6736   coding->src_bytes = bytes;
6737   coding->src_pos = -chars;
6738   coding->src_pos_byte = -bytes;
6739   coding->src_multibyte = chars < bytes;
6740   coding->dst_object = coding->src_object;
6741   coding->dst_pos = PT;
6742   coding->dst_pos_byte = PT_BYTE;
6743
6744   encode_coding (coding);
6745
6746   unbind_to (count, Qnil);
6747   return coding->result;
6748 }
6749
6750
6751 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6752    SRC_OBJECT into DST_OBJECT by coding context CODING.
6753
6754    SRC_OBJECT is a buffer, a string, or Qnil.
6755
6756    If it is a buffer, the text is at point of the buffer.  FROM and TO
6757    are positions in the buffer.
6758
6759    If it is a string, the text is at the beginning of the string.
6760    FROM and TO are indices to the string.
6761
6762    If it is nil, the text is at coding->source.  FROM and TO are
6763    indices to coding->source.
6764
6765    DST_OBJECT is a buffer, Qt, or Qnil.
6766
6767    If it is a buffer, the decoded text is inserted at point of the
6768    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6769    is deleted.
6770
6771    If it is Qt, a string is made from the decoded text, and
6772    set in CODING->dst_object.
6773
6774    If it is Qnil, the decoded text is stored at CODING->destination.
6775    The caller must allocate CODING->dst_bytes bytes at
6776    CODING->destination by xmalloc.  If the decoded text is longer than
6777    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6778  */
6779
6780 void
6781 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6782                       dst_object)
6783      struct coding_system *coding;
6784      Lisp_Object src_object;
6785      EMACS_INT from, from_byte, to, to_byte;
6786      Lisp_Object dst_object;
6787 {
6788   int count = specpdl_ptr - specpdl;
6789   unsigned char *destination;
6790   EMACS_INT dst_bytes;
6791   EMACS_INT chars = to - from;
6792   EMACS_INT bytes = to_byte - from_byte;
6793   Lisp_Object attrs;
6794   Lisp_Object buffer;
6795   int saved_pt = -1, saved_pt_byte;
6796
6797   buffer = Fcurrent_buffer ();
6798
6799   if (NILP (dst_object))
6800     {
6801       destination = coding->destination;
6802       dst_bytes = coding->dst_bytes;
6803     }
6804
6805   coding->src_object = src_object;
6806   coding->src_chars = chars;
6807   coding->src_bytes = bytes;
6808   coding->src_multibyte = chars < bytes;
6809
6810   if (STRINGP (src_object))
6811     {
6812       coding->src_pos = from;
6813       coding->src_pos_byte = from_byte;
6814     }
6815   else if (BUFFERP (src_object))
6816     {
6817       set_buffer_internal (XBUFFER (src_object));
6818       if (from != GPT)
6819         move_gap_both (from, from_byte);
6820       if (EQ (src_object, dst_object))
6821         {
6822           saved_pt = PT, saved_pt_byte = PT_BYTE;
6823           TEMP_SET_PT_BOTH (from, from_byte);
6824           del_range_both (from, from_byte, to, to_byte, 1);
6825           coding->src_pos = -chars;
6826           coding->src_pos_byte = -bytes;
6827         }
6828       else
6829         {
6830           coding->src_pos = from;
6831           coding->src_pos_byte = from_byte;
6832         }
6833     }
6834
6835   if (CODING_REQUIRE_DETECTION (coding))
6836     detect_coding (coding);
6837   attrs = CODING_ID_ATTRS (coding->id);
6838
6839   if (EQ (dst_object, Qt)
6840       || (! NILP (CODING_ATTR_POST_READ (attrs))
6841           && NILP (dst_object)))
6842     {
6843       coding->dst_object = code_conversion_save (1, 1);
6844       coding->dst_pos = BEG;
6845       coding->dst_pos_byte = BEG_BYTE;
6846       coding->dst_multibyte = 1;
6847     }
6848   else if (BUFFERP (dst_object))
6849     {
6850       code_conversion_save (0, 0);
6851       coding->dst_object = dst_object;
6852       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6853       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6854       coding->dst_multibyte
6855         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6856     }
6857   else
6858     {
6859       code_conversion_save (0, 0);
6860       coding->dst_object = Qnil;
6861       coding->dst_multibyte = 1;
6862     }
6863
6864   decode_coding (coding);
6865
6866   if (BUFFERP (coding->dst_object))
6867     set_buffer_internal (XBUFFER (coding->dst_object));
6868
6869   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6870     {
6871       struct gcpro gcpro1, gcpro2;
6872       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6873       Lisp_Object val;
6874
6875       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6876       GCPRO2 (coding->src_object, coding->dst_object);
6877       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
6878                         make_number (coding->produced_char));
6879       UNGCPRO;
6880       CHECK_NATNUM (val);
6881       coding->produced_char += Z - prev_Z;
6882       coding->produced += Z_BYTE - prev_Z_BYTE;
6883     }
6884
6885   if (EQ (dst_object, Qt))
6886     {
6887       coding->dst_object = Fbuffer_string ();
6888     }
6889   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6890     {
6891       set_buffer_internal (XBUFFER (coding->dst_object));
6892       if (dst_bytes < coding->produced)
6893         {
6894           destination
6895             = (unsigned char *) xrealloc (destination, coding->produced);
6896           if (! destination)
6897             {
6898               record_conversion_result (coding,
6899                                         CODING_RESULT_INSUFFICIENT_DST);
6900               unbind_to (count, Qnil);
6901               return;
6902             }
6903           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6904             move_gap_both (BEGV, BEGV_BYTE);
6905           bcopy (BEGV_ADDR, destination, coding->produced);
6906           coding->destination = destination;
6907         }
6908     }
6909
6910   if (saved_pt >= 0)
6911     {
6912       /* This is the case of:
6913          (BUFFERP (src_object) && EQ (src_object, dst_object))
6914          As we have moved PT while replacing the original buffer
6915          contents, we must recover it now.  */
6916       set_buffer_internal (XBUFFER (src_object));
6917       if (saved_pt < from)
6918         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6919       else if (saved_pt < from + chars)
6920         TEMP_SET_PT_BOTH (from, from_byte);
6921       else if (! NILP (current_buffer->enable_multibyte_characters))
6922         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6923                           saved_pt_byte + (coding->produced - bytes));
6924       else
6925         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6926                           saved_pt_byte + (coding->produced - bytes));
6927     }
6928
6929   unbind_to (count, coding->dst_object);
6930 }
6931
6932
6933 void
6934 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6935                       dst_object)
6936      struct coding_system *coding;
6937      Lisp_Object src_object;
6938      EMACS_INT from, from_byte, to, to_byte;
6939      Lisp_Object dst_object;
6940 {
6941   int count = specpdl_ptr - specpdl;
6942   EMACS_INT chars = to - from;
6943   EMACS_INT bytes = to_byte - from_byte;
6944   Lisp_Object attrs;
6945   Lisp_Object buffer;
6946   int saved_pt = -1, saved_pt_byte;
6947   int kill_src_buffer = 0;
6948
6949   buffer = Fcurrent_buffer ();
6950
6951   coding->src_object = src_object;
6952   coding->src_chars = chars;
6953   coding->src_bytes = bytes;
6954   coding->src_multibyte = chars < bytes;
6955
6956   attrs = CODING_ID_ATTRS (coding->id);
6957
6958   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6959     {
6960       coding->src_object = code_conversion_save (1, coding->src_multibyte);
6961       set_buffer_internal (XBUFFER (coding->src_object));
6962       if (STRINGP (src_object))
6963         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6964       else if (BUFFERP (src_object))
6965         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6966       else
6967         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6968
6969       if (EQ (src_object, dst_object))
6970         {
6971           set_buffer_internal (XBUFFER (src_object));
6972           saved_pt = PT, saved_pt_byte = PT_BYTE;
6973           del_range_both (from, from_byte, to, to_byte, 1);
6974           set_buffer_internal (XBUFFER (coding->src_object));
6975         }
6976
6977       {
6978         Lisp_Object args[3];
6979
6980         args[0] = CODING_ATTR_PRE_WRITE (attrs);
6981         args[1] = make_number (BEG);
6982         args[2] = make_number (Z);
6983         safe_call (3, args);
6984       }
6985       if (XBUFFER (coding->src_object) != current_buffer)
6986         kill_src_buffer = 1;
6987       coding->src_object = Fcurrent_buffer ();
6988       if (BEG != GPT)
6989         move_gap_both (BEG, BEG_BYTE);
6990       coding->src_chars = Z - BEG;
6991       coding->src_bytes = Z_BYTE - BEG_BYTE;
6992       coding->src_pos = BEG;
6993       coding->src_pos_byte = BEG_BYTE;
6994       coding->src_multibyte = Z < Z_BYTE;
6995     }
6996   else if (STRINGP (src_object))
6997     {
6998       code_conversion_save (0, 0);
6999       coding->src_pos = from;
7000       coding->src_pos_byte = from_byte;
7001     }
7002   else if (BUFFERP (src_object))
7003     {
7004       code_conversion_save (0, 0);
7005       set_buffer_internal (XBUFFER (src_object));
7006       if (EQ (src_object, dst_object))
7007         {
7008           saved_pt = PT, saved_pt_byte = PT_BYTE;
7009           coding->src_object = del_range_1 (from, to, 1, 1);
7010           coding->src_pos = 0;
7011           coding->src_pos_byte = 0;
7012         }
7013       else
7014         {
7015           if (from < GPT && to >= GPT)
7016             move_gap_both (from, from_byte);
7017           coding->src_pos = from;
7018           coding->src_pos_byte = from_byte;
7019         }
7020     }
7021   else
7022     code_conversion_save (0, 0);
7023
7024   if (BUFFERP (dst_object))
7025     {
7026       coding->dst_object = dst_object;
7027       if (EQ (src_object, dst_object))
7028         {
7029           coding->dst_pos = from;
7030           coding->dst_pos_byte = from_byte;
7031         }
7032       else
7033         {
7034           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7035           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7036         }
7037       coding->dst_multibyte
7038         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7039     }
7040   else if (EQ (dst_object, Qt))
7041     {
7042       coding->dst_object = Qnil;
7043       coding->dst_bytes = coding->src_chars;
7044       if (coding->dst_bytes == 0)
7045         coding->dst_bytes = 1;
7046       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7047       coding->dst_multibyte = 0;
7048     }
7049   else
7050     {
7051       coding->dst_object = Qnil;
7052       coding->dst_multibyte = 0;
7053     }
7054
7055   encode_coding (coding);
7056
7057   if (EQ (dst_object, Qt))
7058     {
7059       if (BUFFERP (coding->dst_object))
7060         coding->dst_object = Fbuffer_string ();
7061       else
7062         {
7063           coding->dst_object
7064             = make_unibyte_string ((char *) coding->destination,
7065                                    coding->produced);
7066           xfree (coding->destination);
7067         }
7068     }
7069
7070   if (saved_pt >= 0)
7071     {
7072       /* This is the case of:
7073          (BUFFERP (src_object) && EQ (src_object, dst_object))
7074          As we have moved PT while replacing the original buffer
7075          contents, we must recover it now.  */
7076       set_buffer_internal (XBUFFER (src_object));
7077       if (saved_pt < from)
7078         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7079       else if (saved_pt < from + chars)
7080         TEMP_SET_PT_BOTH (from, from_byte);
7081       else if (! NILP (current_buffer->enable_multibyte_characters))
7082         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7083                           saved_pt_byte + (coding->produced - bytes));
7084       else
7085         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7086                           saved_pt_byte + (coding->produced - bytes));
7087     }
7088
7089   if (kill_src_buffer)
7090     Fkill_buffer (coding->src_object);
7091   unbind_to (count, Qnil);
7092 }
7093
7094
7095 Lisp_Object
7096 preferred_coding_system ()
7097 {
7098   int id = coding_categories[coding_priorities[0]].id;
7099
7100   return CODING_ID_NAME (id);
7101 }
7102
7103 \f
7104 #ifdef emacs
7105 /*** 8. Emacs Lisp library functions ***/
7106
7107 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7108        doc: /* Return t if OBJECT is nil or a coding-system.
7109 See the documentation of `define-coding-system' for information
7110 about coding-system objects.  */)
7111      (obj)
7112      Lisp_Object obj;
7113 {
7114   if (NILP (obj)
7115       || CODING_SYSTEM_ID (obj) >= 0)
7116     return Qt;
7117   if (! SYMBOLP (obj)
7118       || NILP (Fget (obj, Qcoding_system_define_form)))
7119     return Qnil;
7120   return Qt;
7121 }
7122
7123 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7124        Sread_non_nil_coding_system, 1, 1, 0,
7125        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7126      (prompt)
7127      Lisp_Object prompt;
7128 {
7129   Lisp_Object val;
7130   do
7131     {
7132       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7133                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7134     }
7135   while (SCHARS (val) == 0);
7136   return (Fintern (val, Qnil));
7137 }
7138
7139 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7140        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7141 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
7142      (prompt, default_coding_system)
7143      Lisp_Object prompt, default_coding_system;
7144 {
7145   Lisp_Object val;
7146   if (SYMBOLP (default_coding_system))
7147     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
7148   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7149                           Qt, Qnil, Qcoding_system_history,
7150                           default_coding_system, Qnil);
7151   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7152 }
7153
7154 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7155        1, 1, 0,
7156        doc: /* Check validity of CODING-SYSTEM.
7157 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7158 It is valid if it is nil or a symbol defined as a coding system by the
7159 function `define-coding-system'.  */)
7160   (coding_system)
7161      Lisp_Object coding_system;
7162 {
7163   Lisp_Object define_form;
7164
7165   define_form = Fget (coding_system, Qcoding_system_define_form);
7166   if (! NILP (define_form))
7167     {
7168       Fput (coding_system, Qcoding_system_define_form, Qnil);
7169       safe_eval (define_form);
7170     }
7171   if (!NILP (Fcoding_system_p (coding_system)))
7172     return coding_system;
7173   while (1)
7174     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7175 }
7176
7177 \f
7178 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
7179    HIGHEST is nonzero, return the coding system of the highest
7180    priority among the detected coding systems.  Otherwize return a
7181    list of detected coding systems sorted by their priorities.  If
7182    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7183    multibyte form but contains only ASCII and eight-bit chars.
7184    Otherwise, the bytes are raw bytes.
7185
7186    CODING-SYSTEM controls the detection as below:
7187
7188    If it is nil, detect both text-format and eol-format.  If the
7189    text-format part of CODING-SYSTEM is already specified
7190    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
7191    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7192    detect only text-format.  */
7193
7194 Lisp_Object
7195 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7196                       coding_system)
7197      const unsigned char *src;
7198      int src_chars, src_bytes, highest;
7199      int multibytep;
7200      Lisp_Object coding_system;
7201 {
7202   const unsigned char *src_end = src + src_bytes;
7203   Lisp_Object attrs, eol_type;
7204   Lisp_Object val;
7205   struct coding_system coding;
7206   int id;
7207   struct coding_detection_info detect_info;
7208   enum coding_category base_category;
7209
7210   if (NILP (coding_system))
7211     coding_system = Qundecided;
7212   setup_coding_system (coding_system, &coding);
7213   attrs = CODING_ID_ATTRS (coding.id);
7214   eol_type = CODING_ID_EOL_TYPE (coding.id);
7215   coding_system = CODING_ATTR_BASE_NAME (attrs);
7216
7217   coding.source = src;
7218   coding.src_chars = src_chars;
7219   coding.src_bytes = src_bytes;
7220   coding.src_multibyte = multibytep;
7221   coding.consumed = 0;
7222   coding.mode |= CODING_MODE_LAST_BLOCK;
7223
7224   detect_info.checked = detect_info.found = detect_info.rejected = 0;
7225
7226   /* At first, detect text-format if necessary.  */
7227   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7228   if (base_category == coding_category_undecided)
7229     {
7230       enum coding_category category;
7231       struct coding_system *this;
7232       int c, i;
7233
7234       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
7235       for (i = 0; src < src_end; i++, src++)
7236         {
7237           c = *src;
7238           if (c & 0x80)
7239             break;
7240           if (c < 0x20
7241               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7242               && inhibit_iso_escape_detection)
7243             {
7244               coding.head_ascii = src - coding.source;
7245               if (detect_coding_iso_2022 (&coding, &detect_info))
7246                 {
7247                   /* We have scanned the whole data.  */
7248                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7249                     /* We didn't find an 8-bit code.  */
7250                     src = src_end;
7251                   break;
7252                 }
7253             }
7254         }
7255       coding.head_ascii = src - coding.source;
7256
7257       if (src < src_end
7258           || detect_info.found)
7259         {
7260           if (src == src_end)
7261             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
7262             for (i = 0; i < coding_category_raw_text; i++)
7263               {
7264                 category = coding_priorities[i];
7265                 if (detect_info.found & (1 << category))
7266                   break;
7267               }
7268           else
7269             for (i = 0; i < coding_category_raw_text; i++)
7270               {
7271                 category = coding_priorities[i];
7272                 this = coding_categories + category;
7273
7274                 if (this->id < 0)
7275                   {
7276                     /* No coding system of this category is defined.  */
7277                     detect_info.rejected |= (1 << category);
7278                   }
7279                 else if (category >= coding_category_raw_text)
7280                   continue;
7281                 else if (detect_info.checked & (1 << category))
7282                   {
7283                     if (highest
7284                         && (detect_info.found & (1 << category)))
7285                       break;
7286                   }
7287                 else
7288                   {
7289                     if ((*(this->detector)) (&coding, &detect_info)
7290                         && highest
7291                         && (detect_info.found & (1 << category)))
7292                       {
7293                         if (category == coding_category_utf_16_auto)
7294                           {
7295                             if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7296                               category = coding_category_utf_16_le;
7297                             else
7298                               category = coding_category_utf_16_be;
7299                           }
7300                         break;
7301                       }
7302                   }
7303               }
7304         }
7305
7306       if (detect_info.rejected == CATEGORY_MASK_ANY)
7307         {
7308           detect_info.found = CATEGORY_MASK_RAW_TEXT;
7309           id = coding_categories[coding_category_raw_text].id;
7310           val = Fcons (make_number (id), Qnil);
7311         }
7312       else if (! detect_info.rejected && ! detect_info.found)
7313         {
7314           detect_info.found = CATEGORY_MASK_ANY;
7315           id = coding_categories[coding_category_undecided].id;
7316           val = Fcons (make_number (id), Qnil);
7317         }
7318       else if (highest)
7319         {
7320           if (detect_info.found)
7321             {
7322               detect_info.found = 1 << category;
7323               val = Fcons (make_number (this->id), Qnil);
7324             }
7325           else
7326             for (i = 0; i < coding_category_raw_text; i++)
7327               if (! (detect_info.rejected & (1 << coding_priorities[i])))
7328                 {
7329                   detect_info.found = 1 << coding_priorities[i];
7330                   id = coding_categories[coding_priorities[i]].id;
7331                   val = Fcons (make_number (id), Qnil);
7332                   break;
7333                 }
7334         }
7335       else
7336         {
7337           int mask = detect_info.rejected | detect_info.found;
7338           int found = 0;
7339           val = Qnil;
7340
7341           for (i = coding_category_raw_text - 1; i >= 0; i--)
7342             {
7343               category = coding_priorities[i];
7344               if (! (mask & (1 << category)))
7345                 {
7346                   found |= 1 << category;
7347                   id = coding_categories[category].id;
7348                   val = Fcons (make_number (id), val);
7349                 }
7350             }
7351           for (i = coding_category_raw_text - 1; i >= 0; i--)
7352             {
7353               category = coding_priorities[i];
7354               if (detect_info.found & (1 << category))
7355                 {
7356                   id = coding_categories[category].id;
7357                   val = Fcons (make_number (id), val);
7358                 }
7359             }
7360           detect_info.found |= found;
7361         }
7362     }
7363   else if (base_category == coding_category_utf_16_auto)
7364     {
7365       if (detect_coding_utf_16 (&coding, &detect_info))
7366         {
7367           struct coding_system *this;
7368
7369           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7370             this = coding_categories + coding_category_utf_16_le;
7371           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7372             this = coding_categories + coding_category_utf_16_be;
7373           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7374             this = coding_categories + coding_category_utf_16_be_nosig;
7375           else
7376             this = coding_categories + coding_category_utf_16_le_nosig;
7377           val = Fcons (make_number (this->id), Qnil);
7378         }
7379     }
7380   else
7381     {
7382       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7383       val = Fcons (make_number (coding.id), Qnil);
7384     }
7385
7386   /* Then, detect eol-format if necessary.  */
7387   {
7388     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7389     Lisp_Object tail;
7390
7391     if (VECTORP (eol_type))
7392       {
7393         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7394           normal_eol = detect_eol (coding.source, src_bytes,
7395                                    coding_category_raw_text);
7396         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7397                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7398           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7399                                       coding_category_utf_16_be);
7400         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7401                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7402           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7403                                       coding_category_utf_16_le);
7404       }
7405     else
7406       {
7407         if (EQ (eol_type, Qunix))
7408           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7409         else if (EQ (eol_type, Qdos))
7410           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7411         else
7412           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7413       }
7414
7415     for (tail = val; CONSP (tail); tail = XCDR (tail))
7416       {
7417         enum coding_category category;
7418         int this_eol;
7419
7420         id = XINT (XCAR (tail));
7421         attrs = CODING_ID_ATTRS (id);
7422         category = XINT (CODING_ATTR_CATEGORY (attrs));
7423         eol_type = CODING_ID_EOL_TYPE (id);
7424         if (VECTORP (eol_type))
7425           {
7426             if (category == coding_category_utf_16_be
7427                 || category == coding_category_utf_16_be_nosig)
7428               this_eol = utf_16_be_eol;
7429             else if (category == coding_category_utf_16_le
7430                      || category == coding_category_utf_16_le_nosig)
7431               this_eol = utf_16_le_eol;
7432             else
7433               this_eol = normal_eol;
7434
7435             if (this_eol == EOL_SEEN_LF)
7436               XSETCAR (tail, AREF (eol_type, 0));
7437             else if (this_eol == EOL_SEEN_CRLF)
7438               XSETCAR (tail, AREF (eol_type, 1));
7439             else if (this_eol == EOL_SEEN_CR)
7440               XSETCAR (tail, AREF (eol_type, 2));
7441             else
7442               XSETCAR (tail, CODING_ID_NAME (id));
7443           }
7444         else
7445           XSETCAR (tail, CODING_ID_NAME (id));
7446       }
7447   }
7448
7449   return (highest ? XCAR (val) : val);
7450 }
7451
7452
7453 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7454        2, 3, 0,
7455        doc: /* Detect coding system of the text in the region between START and END.
7456 Return a list of possible coding systems ordered by priority.
7457
7458 If only ASCII characters are found, it returns a list of single element
7459 `undecided' or its subsidiary coding system according to a detected
7460 end-of-line format.
7461
7462 If optional argument HIGHEST is non-nil, return the coding system of
7463 highest priority.  */)
7464      (start, end, highest)
7465      Lisp_Object start, end, highest;
7466 {
7467   int from, to;
7468   int from_byte, to_byte;
7469
7470   CHECK_NUMBER_COERCE_MARKER (start);
7471   CHECK_NUMBER_COERCE_MARKER (end);
7472
7473   validate_region (&start, &end);
7474   from = XINT (start), to = XINT (end);
7475   from_byte = CHAR_TO_BYTE (from);
7476   to_byte = CHAR_TO_BYTE (to);
7477
7478   if (from < GPT && to >= GPT)
7479     move_gap_both (to, to_byte);
7480
7481   return detect_coding_system (BYTE_POS_ADDR (from_byte),
7482                                to - from, to_byte - from_byte,
7483                                !NILP (highest),
7484                                !NILP (current_buffer
7485                                       ->enable_multibyte_characters),
7486                                Qnil);
7487 }
7488
7489 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7490        1, 2, 0,
7491        doc: /* Detect coding system of the text in STRING.
7492 Return a list of possible coding systems ordered by priority.
7493
7494 If only ASCII characters are found, it returns a list of single element
7495 `undecided' or its subsidiary coding system according to a detected
7496 end-of-line format.
7497
7498 If optional argument HIGHEST is non-nil, return the coding system of
7499 highest priority.  */)
7500      (string, highest)
7501      Lisp_Object string, highest;
7502 {
7503   CHECK_STRING (string);
7504
7505   return detect_coding_system (SDATA (string),
7506                                SCHARS (string), SBYTES (string),
7507                                !NILP (highest), STRING_MULTIBYTE (string),
7508                                Qnil);
7509 }
7510
7511
7512 static INLINE int
7513 char_encodable_p (c, attrs)
7514      int c;
7515      Lisp_Object attrs;
7516 {
7517   Lisp_Object tail;
7518   struct charset *charset;
7519   Lisp_Object translation_table;
7520
7521   translation_table = CODING_ATTR_TRANS_TBL (attrs);
7522   if (! NILP (translation_table))
7523     c = translate_char (translation_table, c);
7524   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7525        CONSP (tail); tail = XCDR (tail))
7526     {
7527       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7528       if (CHAR_CHARSET_P (c, charset))
7529         break;
7530     }
7531   return (! NILP (tail));
7532 }
7533
7534
7535 /* Return a list of coding systems that safely encode the text between
7536    START and END.  If EXCLUDE is non-nil, it is a list of coding
7537    systems not to check.  The returned list doesn't contain any such
7538    coding systems.  In any case, if the text contains only ASCII or is
7539    unibyte, return t.  */
7540
7541 DEFUN ("find-coding-systems-region-internal",
7542        Ffind_coding_systems_region_internal,
7543        Sfind_coding_systems_region_internal, 2, 3, 0,
7544        doc: /* Internal use only.  */)
7545      (start, end, exclude)
7546      Lisp_Object start, end, exclude;
7547 {
7548   Lisp_Object coding_attrs_list, safe_codings;
7549   EMACS_INT start_byte, end_byte;
7550   const unsigned char *p, *pbeg, *pend;
7551   int c;
7552   Lisp_Object tail, elt;
7553
7554   if (STRINGP (start))
7555     {
7556       if (!STRING_MULTIBYTE (start)
7557           || SCHARS (start) == SBYTES (start))
7558         return Qt;
7559       start_byte = 0;
7560       end_byte = SBYTES (start);
7561     }
7562   else
7563     {
7564       CHECK_NUMBER_COERCE_MARKER (start);
7565       CHECK_NUMBER_COERCE_MARKER (end);
7566       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7567         args_out_of_range (start, end);
7568       if (NILP (current_buffer->enable_multibyte_characters))
7569         return Qt;
7570       start_byte = CHAR_TO_BYTE (XINT (start));
7571       end_byte = CHAR_TO_BYTE (XINT (end));
7572       if (XINT (end) - XINT (start) == end_byte - start_byte)
7573         return Qt;
7574
7575       if (XINT (start) < GPT && XINT (end) > GPT)
7576         {
7577           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7578             move_gap_both (XINT (start), start_byte);
7579           else
7580             move_gap_both (XINT (end), end_byte);
7581         }
7582     }
7583
7584   coding_attrs_list = Qnil;
7585   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7586     if (NILP (exclude)
7587         || NILP (Fmemq (XCAR (tail), exclude)))
7588       {
7589         Lisp_Object attrs;
7590
7591         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7592         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7593             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7594           {
7595             ASET (attrs, coding_attr_trans_tbl,
7596                   get_translation_table (attrs, 1, NULL));
7597             coding_attrs_list = Fcons (attrs, coding_attrs_list);
7598           }
7599       }
7600
7601   if (STRINGP (start))
7602     p = pbeg = SDATA (start);
7603   else
7604     p = pbeg = BYTE_POS_ADDR (start_byte);
7605   pend = p + (end_byte - start_byte);
7606
7607   while (p < pend && ASCII_BYTE_P (*p)) p++;
7608   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7609
7610   while (p < pend)
7611     {
7612       if (ASCII_BYTE_P (*p))
7613         p++;
7614       else
7615         {
7616           c = STRING_CHAR_ADVANCE (p);
7617
7618           charset_map_loaded = 0;
7619           for (tail = coding_attrs_list; CONSP (tail);)
7620             {
7621               elt = XCAR (tail);
7622               if (NILP (elt))
7623                 tail = XCDR (tail);
7624               else if (char_encodable_p (c, elt))
7625                 tail = XCDR (tail);
7626               else if (CONSP (XCDR (tail)))
7627                 {
7628                   XSETCAR (tail, XCAR (XCDR (tail)));
7629                   XSETCDR (tail, XCDR (XCDR (tail)));
7630                 }
7631               else
7632                 {
7633                   XSETCAR (tail, Qnil);
7634                   tail = XCDR (tail);
7635                 }
7636             }
7637           if (charset_map_loaded)
7638             {
7639               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7640
7641               if (STRINGP (start))
7642                 pbeg = SDATA (start);
7643               else
7644                 pbeg = BYTE_POS_ADDR (start_byte);
7645               p = pbeg + p_offset;
7646               pend = pbeg + pend_offset;
7647             }
7648         }
7649     }
7650
7651   safe_codings = list2 (Qraw_text, Qno_conversion);
7652   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7653     if (! NILP (XCAR (tail)))
7654       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7655
7656   return safe_codings;
7657 }
7658
7659
7660 DEFUN ("unencodable-char-position", Funencodable_char_position,
7661        Sunencodable_char_position, 3, 5, 0,
7662        doc: /*
7663 Return position of first un-encodable character in a region.
7664 START and END specfiy the region and CODING-SYSTEM specifies the
7665 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7666
7667 If optional 4th argument COUNT is non-nil, it specifies at most how
7668 many un-encodable characters to search.  In this case, the value is a
7669 list of positions.
7670
7671 If optional 5th argument STRING is non-nil, it is a string to search
7672 for un-encodable characters.  In that case, START and END are indexes
7673 to the string.  */)
7674      (start, end, coding_system, count, string)
7675      Lisp_Object start, end, coding_system, count, string;
7676 {
7677   int n;
7678   struct coding_system coding;
7679   Lisp_Object attrs, charset_list, translation_table;
7680   Lisp_Object positions;
7681   int from, to;
7682   const unsigned char *p, *stop, *pend;
7683   int ascii_compatible;
7684
7685   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7686   attrs = CODING_ID_ATTRS (coding.id);
7687   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7688     return Qnil;
7689   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7690   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7691   translation_table = get_translation_table (attrs, 1, NULL);
7692
7693   if (NILP (string))
7694     {
7695       validate_region (&start, &end);
7696       from = XINT (start);
7697       to = XINT (end);
7698       if (NILP (current_buffer->enable_multibyte_characters)
7699           || (ascii_compatible
7700               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7701         return Qnil;
7702       p = CHAR_POS_ADDR (from);
7703       pend = CHAR_POS_ADDR (to);
7704       if (from < GPT && to >= GPT)
7705         stop = GPT_ADDR;
7706       else
7707         stop = pend;
7708     }
7709   else
7710     {
7711       CHECK_STRING (string);
7712       CHECK_NATNUM (start);
7713       CHECK_NATNUM (end);
7714       from = XINT (start);
7715       to = XINT (end);
7716       if (from > to
7717           || to > SCHARS (string))
7718         args_out_of_range_3 (string, start, end);
7719       if (! STRING_MULTIBYTE (string))
7720         return Qnil;
7721       p = SDATA (string) + string_char_to_byte (string, from);
7722       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7723       if (ascii_compatible && (to - from) == (pend - p))
7724         return Qnil;
7725     }
7726
7727   if (NILP (count))
7728     n = 1;
7729   else
7730     {
7731       CHECK_NATNUM (count);
7732       n = XINT (count);
7733     }
7734
7735   positions = Qnil;
7736   while (1)
7737     {
7738       int c;
7739
7740       if (ascii_compatible)
7741         while (p < stop && ASCII_BYTE_P (*p))
7742           p++, from++;
7743       if (p >= stop)
7744         {
7745           if (p >= pend)
7746             break;
7747           stop = pend;
7748           p = GAP_END_ADDR;
7749         }
7750
7751       c = STRING_CHAR_ADVANCE (p);
7752       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7753           && ! char_charset (translate_char (translation_table, c),
7754                              charset_list, NULL))
7755         {
7756           positions = Fcons (make_number (from), positions);
7757           n--;
7758           if (n == 0)
7759             break;
7760         }
7761
7762       from++;
7763     }
7764
7765   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7766 }
7767
7768
7769 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7770        Scheck_coding_systems_region, 3, 3, 0,
7771        doc: /* Check if the region is encodable by coding systems.
7772
7773 START and END are buffer positions specifying the region.
7774 CODING-SYSTEM-LIST is a list of coding systems to check.
7775
7776 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7777 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7778 whole region, POS0, POS1, ... are buffer positions where non-encodable
7779 characters are found.
7780
7781 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7782 value is nil.
7783
7784 START may be a string.  In that case, check if the string is
7785 encodable, and the value contains indices to the string instead of
7786 buffer positions.  END is ignored.  */)
7787      (start, end, coding_system_list)
7788      Lisp_Object start, end, coding_system_list;
7789 {
7790   Lisp_Object list;
7791   EMACS_INT start_byte, end_byte;
7792   int pos;
7793   const unsigned char *p, *pbeg, *pend;
7794   int c;
7795   Lisp_Object tail, elt, attrs;
7796
7797   if (STRINGP (start))
7798     {
7799       if (!STRING_MULTIBYTE (start)
7800           && SCHARS (start) != SBYTES (start))
7801         return Qnil;
7802       start_byte = 0;
7803       end_byte = SBYTES (start);
7804       pos = 0;
7805     }
7806   else
7807     {
7808       CHECK_NUMBER_COERCE_MARKER (start);
7809       CHECK_NUMBER_COERCE_MARKER (end);
7810       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7811         args_out_of_range (start, end);
7812       if (NILP (current_buffer->enable_multibyte_characters))
7813         return Qnil;
7814       start_byte = CHAR_TO_BYTE (XINT (start));
7815       end_byte = CHAR_TO_BYTE (XINT (end));
7816       if (XINT (end) - XINT (start) == end_byte - start_byte)
7817         return Qt;
7818
7819       if (XINT (start) < GPT && XINT (end) > GPT)
7820         {
7821           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7822             move_gap_both (XINT (start), start_byte);
7823           else
7824             move_gap_both (XINT (end), end_byte);
7825         }
7826       pos = XINT (start);
7827     }
7828
7829   list = Qnil;
7830   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7831     {
7832       elt = XCAR (tail);
7833       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7834       ASET (attrs, coding_attr_trans_tbl,
7835             get_translation_table (attrs, 1, NULL));
7836       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7837     }
7838
7839   if (STRINGP (start))
7840     p = pbeg = SDATA (start);
7841   else
7842     p = pbeg = BYTE_POS_ADDR (start_byte);
7843   pend = p + (end_byte - start_byte);
7844
7845   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7846   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7847
7848   while (p < pend)
7849     {
7850       if (ASCII_BYTE_P (*p))
7851         p++;
7852       else
7853         {
7854           c = STRING_CHAR_ADVANCE (p);
7855
7856           charset_map_loaded = 0;
7857           for (tail = list; CONSP (tail); tail = XCDR (tail))
7858             {
7859               elt = XCDR (XCAR (tail));
7860               if (! char_encodable_p (c, XCAR (elt)))
7861                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7862             }
7863           if (charset_map_loaded)
7864             {
7865               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7866
7867               if (STRINGP (start))
7868                 pbeg = SDATA (start);
7869               else
7870                 pbeg = BYTE_POS_ADDR (start_byte);
7871               p = pbeg + p_offset;
7872               pend = pbeg + pend_offset;
7873             }
7874         }
7875       pos++;
7876     }
7877
7878   tail = list;
7879   list = Qnil;
7880   for (; CONSP (tail); tail = XCDR (tail))
7881     {
7882       elt = XCAR (tail);
7883       if (CONSP (XCDR (XCDR (elt))))
7884         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7885                       list);
7886     }
7887
7888   return list;
7889 }
7890
7891
7892 Lisp_Object
7893 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7894      Lisp_Object start, end, coding_system, dst_object;
7895      int encodep, norecord;
7896 {
7897   struct coding_system coding;
7898   EMACS_INT from, from_byte, to, to_byte;
7899   Lisp_Object src_object;
7900
7901   CHECK_NUMBER_COERCE_MARKER (start);
7902   CHECK_NUMBER_COERCE_MARKER (end);
7903   if (NILP (coding_system))
7904     coding_system = Qno_conversion;
7905   else
7906     CHECK_CODING_SYSTEM (coding_system);
7907   src_object = Fcurrent_buffer ();
7908   if (NILP (dst_object))
7909     dst_object = src_object;
7910   else if (! EQ (dst_object, Qt))
7911     CHECK_BUFFER (dst_object);
7912
7913   validate_region (&start, &end);
7914   from = XFASTINT (start);
7915   from_byte = CHAR_TO_BYTE (from);
7916   to = XFASTINT (end);
7917   to_byte = CHAR_TO_BYTE (to);
7918
7919   setup_coding_system (coding_system, &coding);
7920   coding.mode |= CODING_MODE_LAST_BLOCK;
7921
7922   if (encodep)
7923     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7924                           dst_object);
7925   else
7926     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7927                           dst_object);
7928   if (! norecord)
7929     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7930
7931   return (BUFFERP (dst_object)
7932           ? make_number (coding.produced_char)
7933           : coding.dst_object);
7934 }
7935
7936
7937 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7938        3, 4, "r\nzCoding system: ",
7939        doc: /* Decode the current region from the specified coding system.
7940 When called from a program, takes four arguments:
7941         START, END, CODING-SYSTEM, and DESTINATION.
7942 START and END are buffer positions.
7943
7944 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7945 If nil, the region between START and END is replace by the decoded text.
7946 If buffer, the decoded text is inserted in the buffer.
7947 If t, the decoded text is returned.
7948
7949 This function sets `last-coding-system-used' to the precise coding system
7950 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7951 not fully specified.)
7952 It returns the length of the decoded text.  */)
7953      (start, end, coding_system, destination)
7954      Lisp_Object start, end, coding_system, destination;
7955 {
7956   return code_convert_region (start, end, coding_system, destination, 0, 0);
7957 }
7958
7959 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7960        3, 4, "r\nzCoding system: ",
7961        doc: /* Encode the current region by specified coding system.
7962 When called from a program, takes three arguments:
7963 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7964
7965 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7966 If nil, the region between START and END is replace by the encoded text.
7967 If buffer, the encoded text is inserted in the buffer.
7968 If t, the encoded text is returned.
7969
7970 This function sets `last-coding-system-used' to the precise coding system
7971 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7972 not fully specified.)
7973 It returns the length of the encoded text.  */)
7974   (start, end, coding_system, destination)
7975      Lisp_Object start, end, coding_system, destination;
7976 {
7977   return code_convert_region (start, end, coding_system, destination, 1, 0);
7978 }
7979
7980 Lisp_Object
7981 code_convert_string (string, coding_system, dst_object,
7982                      encodep, nocopy, norecord)
7983      Lisp_Object string, coding_system, dst_object;
7984      int encodep, nocopy, norecord;
7985 {
7986   struct coding_system coding;
7987   EMACS_INT chars, bytes;
7988
7989   CHECK_STRING (string);
7990   if (NILP (coding_system))
7991     {
7992       if (! norecord)
7993         Vlast_coding_system_used = Qno_conversion;
7994       if (NILP (dst_object))
7995         return (nocopy ? Fcopy_sequence (string) : string);
7996     }
7997
7998   if (NILP (coding_system))
7999     coding_system = Qno_conversion;
8000   else
8001     CHECK_CODING_SYSTEM (coding_system);
8002   if (NILP (dst_object))
8003     dst_object = Qt;
8004   else if (! EQ (dst_object, Qt))
8005     CHECK_BUFFER (dst_object);
8006
8007   setup_coding_system (coding_system, &coding);
8008   coding.mode |= CODING_MODE_LAST_BLOCK;
8009   chars = SCHARS (string);
8010   bytes = SBYTES (string);
8011   if (encodep)
8012     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8013   else
8014     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8015   if (! norecord)
8016     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8017
8018   return (BUFFERP (dst_object)
8019           ? make_number (coding.produced_char)
8020           : coding.dst_object);
8021 }
8022
8023
8024 /* Encode or decode STRING according to CODING_SYSTEM.
8025    Do not set Vlast_coding_system_used.
8026
8027    This function is called only from macros DECODE_FILE and
8028    ENCODE_FILE, thus we ignore character composition.  */
8029
8030 Lisp_Object
8031 code_convert_string_norecord (string, coding_system, encodep)
8032      Lisp_Object string, coding_system;
8033      int encodep;
8034 {
8035   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8036 }
8037
8038
8039 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8040        2, 4, 0,
8041        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8042
8043 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8044 if the decoding operation is trivial.
8045
8046 Optional fourth arg BUFFER non-nil meant that the decoded text is
8047 inserted in BUFFER instead of returned as a string.  In this case,
8048 the return value is BUFFER.
8049
8050 This function sets `last-coding-system-used' to the precise coding system
8051 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8052 not fully specified.  */)
8053   (string, coding_system, nocopy, buffer)
8054      Lisp_Object string, coding_system, nocopy, buffer;
8055 {
8056   return code_convert_string (string, coding_system, buffer,
8057                               0, ! NILP (nocopy), 0);
8058 }
8059
8060 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8061        2, 4, 0,
8062        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8063
8064 Optional third arg NOCOPY non-nil means it is OK to return STRING
8065 itself if the encoding operation is trivial.
8066
8067 Optional fourth arg BUFFER non-nil meant that the encoded text is
8068 inserted in BUFFER instead of returned as a string.  In this case,
8069 the return value is BUFFER.
8070
8071 This function sets `last-coding-system-used' to the precise coding system
8072 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8073 not fully specified.)  */)
8074      (string, coding_system, nocopy, buffer)
8075      Lisp_Object string, coding_system, nocopy, buffer;
8076 {
8077   return code_convert_string (string, coding_system, buffer,
8078                               1, ! NILP (nocopy), 1);
8079 }
8080
8081 \f
8082 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8083        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8084 Return the corresponding character.  */)
8085      (code)
8086      Lisp_Object code;
8087 {
8088   Lisp_Object spec, attrs, val;
8089   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8090   int c;
8091
8092   CHECK_NATNUM (code);
8093   c = XFASTINT (code);
8094   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8095   attrs = AREF (spec, 0);
8096
8097   if (ASCII_BYTE_P (c)
8098       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8099     return code;
8100
8101   val = CODING_ATTR_CHARSET_LIST (attrs);
8102   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8103   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8104   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8105
8106   if (c <= 0x7F)
8107     charset = charset_roman;
8108   else if (c >= 0xA0 && c < 0xDF)
8109     {
8110       charset = charset_kana;
8111       c -= 0x80;
8112     }
8113   else
8114     {
8115       int s1 = c >> 8, s2 = c & 0xFF;
8116
8117       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8118           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8119         error ("Invalid code: %d", code);
8120       SJIS_TO_JIS (c);
8121       charset = charset_kanji;
8122     }
8123   c = DECODE_CHAR (charset, c);
8124   if (c < 0)
8125     error ("Invalid code: %d", code);
8126   return make_number (c);
8127 }
8128
8129
8130 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8131        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
8132 Return the corresponding code in SJIS.  */)
8133      (ch)
8134     Lisp_Object ch;
8135 {
8136   Lisp_Object spec, attrs, charset_list;
8137   int c;
8138   struct charset *charset;
8139   unsigned code;
8140
8141   CHECK_CHARACTER (ch);
8142   c = XFASTINT (ch);
8143   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8144   attrs = AREF (spec, 0);
8145
8146   if (ASCII_CHAR_P (c)
8147       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8148     return ch;
8149
8150   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8151   charset = char_charset (c, charset_list, &code);
8152   if (code == CHARSET_INVALID_CODE (charset))
8153     error ("Can't encode by shift_jis encoding: %d", c);
8154   JIS_TO_SJIS (code);
8155
8156   return make_number (code);
8157 }
8158
8159 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8160        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8161 Return the corresponding character.  */)
8162      (code)
8163      Lisp_Object code;
8164 {
8165   Lisp_Object spec, attrs, val;
8166   struct charset *charset_roman, *charset_big5, *charset;
8167   int c;
8168
8169   CHECK_NATNUM (code);
8170   c = XFASTINT (code);
8171   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8172   attrs = AREF (spec, 0);
8173
8174   if (ASCII_BYTE_P (c)
8175       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8176     return code;
8177
8178   val = CODING_ATTR_CHARSET_LIST (attrs);
8179   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8180   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8181
8182   if (c <= 0x7F)
8183     charset = charset_roman;
8184   else
8185     {
8186       int b1 = c >> 8, b2 = c & 0x7F;
8187       if (b1 < 0xA1 || b1 > 0xFE
8188           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8189         error ("Invalid code: %d", code);
8190       charset = charset_big5;
8191     }
8192   c = DECODE_CHAR (charset, (unsigned )c);
8193   if (c < 0)
8194     error ("Invalid code: %d", code);
8195   return make_number (c);
8196 }
8197
8198 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8199        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
8200 Return the corresponding character code in Big5.  */)
8201      (ch)
8202      Lisp_Object ch;
8203 {
8204   Lisp_Object spec, attrs, charset_list;
8205   struct charset *charset;
8206   int c;
8207   unsigned code;
8208
8209   CHECK_CHARACTER (ch);
8210   c = XFASTINT (ch);
8211   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8212   attrs = AREF (spec, 0);
8213   if (ASCII_CHAR_P (c)
8214       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8215     return ch;
8216
8217   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8218   charset = char_charset (c, charset_list, &code);
8219   if (code == CHARSET_INVALID_CODE (charset))
8220     error ("Can't encode by Big5 encoding: %d", c);
8221
8222   return make_number (code);
8223 }
8224
8225 \f
8226 DEFUN ("set-terminal-coding-system-internal",
8227        Fset_terminal_coding_system_internal,
8228        Sset_terminal_coding_system_internal, 1, 1, 0,
8229        doc: /* Internal use only.  */)
8230      (coding_system)
8231      Lisp_Object coding_system;
8232 {
8233   CHECK_SYMBOL (coding_system);
8234   setup_coding_system (Fcheck_coding_system (coding_system),
8235                         &terminal_coding);
8236
8237   /* We had better not send unsafe characters to terminal.  */
8238   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
8239   /* Characer composition should be disabled.  */
8240   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8241   terminal_coding.src_multibyte = 1;
8242   terminal_coding.dst_multibyte = 0;
8243   return Qnil;
8244 }
8245
8246 DEFUN ("set-safe-terminal-coding-system-internal",
8247        Fset_safe_terminal_coding_system_internal,
8248        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8249        doc: /* Internal use only.  */)
8250      (coding_system)
8251      Lisp_Object coding_system;
8252 {
8253   CHECK_SYMBOL (coding_system);
8254   setup_coding_system (Fcheck_coding_system (coding_system),
8255                        &safe_terminal_coding);
8256   /* Characer composition should be disabled.  */
8257   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8258   safe_terminal_coding.src_multibyte = 1;
8259   safe_terminal_coding.dst_multibyte = 0;
8260   return Qnil;
8261 }
8262
8263 DEFUN ("terminal-coding-system",
8264        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
8265        doc: /* Return coding system specified for terminal output.  */)
8266      ()
8267 {
8268   Lisp_Object coding_system;
8269
8270   coding_system = CODING_ID_NAME (terminal_coding.id);
8271   /* For backward compatibility, return nil if it is `undecided'. */
8272   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
8273 }
8274
8275 DEFUN ("set-keyboard-coding-system-internal",
8276        Fset_keyboard_coding_system_internal,
8277        Sset_keyboard_coding_system_internal, 1, 1, 0,
8278        doc: /* Internal use only.  */)
8279      (coding_system)
8280      Lisp_Object coding_system;
8281 {
8282   CHECK_SYMBOL (coding_system);
8283   setup_coding_system (Fcheck_coding_system (coding_system),
8284                        &keyboard_coding);
8285   /* Characer composition should be disabled.  */
8286   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8287   return Qnil;
8288 }
8289
8290 DEFUN ("keyboard-coding-system",
8291        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8292        doc: /* Return coding system specified for decoding keyboard input.  */)
8293      ()
8294 {
8295   return CODING_ID_NAME (keyboard_coding.id);
8296 }
8297
8298 \f
8299 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8300        Sfind_operation_coding_system,  1, MANY, 0,
8301        doc: /* Choose a coding system for an operation based on the target name.
8302 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8303 DECODING-SYSTEM is the coding system to use for decoding
8304 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8305 for encoding (in case OPERATION does encoding).
8306
8307 The first argument OPERATION specifies an I/O primitive:
8308   For file I/O, `insert-file-contents' or `write-region'.
8309   For process I/O, `call-process', `call-process-region', or `start-process'.
8310   For network I/O, `open-network-stream'.
8311
8312 The remaining arguments should be the same arguments that were passed
8313 to the primitive.  Depending on which primitive, one of those arguments
8314 is selected as the TARGET.  For example, if OPERATION does file I/O,
8315 whichever argument specifies the file name is TARGET.
8316
8317 TARGET has a meaning which depends on OPERATION:
8318   For file I/O, TARGET is a file name.
8319   For process I/O, TARGET is a process name.
8320   For network I/O, TARGET is a service name or a port number
8321
8322 This function looks up what specified for TARGET in,
8323 `file-coding-system-alist', `process-coding-system-alist',
8324 or `network-coding-system-alist' depending on OPERATION.
8325 They may specify a coding system, a cons of coding systems,
8326 or a function symbol to call.
8327 In the last case, we call the function with one argument,
8328 which is a list of all the arguments given to this function.
8329
8330 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
8331      (nargs, args)
8332      int nargs;
8333      Lisp_Object *args;
8334 {
8335   Lisp_Object operation, target_idx, target, val;
8336   register Lisp_Object chain;
8337
8338   if (nargs < 2)
8339     error ("Too few arguments");
8340   operation = args[0];
8341   if (!SYMBOLP (operation)
8342       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8343     error ("Invalid first arguement");
8344   if (nargs < 1 + XINT (target_idx))
8345     error ("Too few arguments for operation: %s",
8346            SDATA (SYMBOL_NAME (operation)));
8347   target = args[XINT (target_idx) + 1];
8348   if (!(STRINGP (target)
8349         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8350     error ("Invalid %dth argument", XINT (target_idx) + 1);
8351
8352   chain = ((EQ (operation, Qinsert_file_contents)
8353             || EQ (operation, Qwrite_region))
8354            ? Vfile_coding_system_alist
8355            : (EQ (operation, Qopen_network_stream)
8356               ? Vnetwork_coding_system_alist
8357               : Vprocess_coding_system_alist));
8358   if (NILP (chain))
8359     return Qnil;
8360
8361   for (; CONSP (chain); chain = XCDR (chain))
8362     {
8363       Lisp_Object elt;
8364
8365       elt = XCAR (chain);
8366       if (CONSP (elt)
8367           && ((STRINGP (target)
8368                && STRINGP (XCAR (elt))
8369                && fast_string_match (XCAR (elt), target) >= 0)
8370               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8371         {
8372           val = XCDR (elt);
8373           /* Here, if VAL is both a valid coding system and a valid
8374              function symbol, we return VAL as a coding system.  */
8375           if (CONSP (val))
8376             return val;
8377           if (! SYMBOLP (val))
8378             return Qnil;
8379           if (! NILP (Fcoding_system_p (val)))
8380             return Fcons (val, val);
8381           if (! NILP (Ffboundp (val)))
8382             {
8383               val = call1 (val, Flist (nargs, args));
8384               if (CONSP (val))
8385                 return val;
8386               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8387                 return Fcons (val, val);
8388             }
8389           return Qnil;
8390         }
8391     }
8392   return Qnil;
8393 }
8394
8395 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8396        Sset_coding_system_priority, 0, MANY, 0,
8397        doc: /* Assign higher priority to the coding systems given as arguments.
8398 If multiple coding systems belongs to the same category,
8399 all but the first one are ignored.
8400
8401 usage: (set-coding-system-priority ...)  */)
8402      (nargs, args)
8403      int nargs;
8404      Lisp_Object *args;
8405 {
8406   int i, j;
8407   int changed[coding_category_max];
8408   enum coding_category priorities[coding_category_max];
8409
8410   bzero (changed, sizeof changed);
8411
8412   for (i = j = 0; i < nargs; i++)
8413     {
8414       enum coding_category category;
8415       Lisp_Object spec, attrs;
8416
8417       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8418       attrs = AREF (spec, 0);
8419       category = XINT (CODING_ATTR_CATEGORY (attrs));
8420       if (changed[category])
8421         /* Ignore this coding system because a coding system of the
8422            same category already had a higher priority.  */
8423         continue;
8424       changed[category] = 1;
8425       priorities[j++] = category;
8426       if (coding_categories[category].id >= 0
8427           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8428         setup_coding_system (args[i], &coding_categories[category]);
8429       Fset (AREF (Vcoding_category_table, category), args[i]);
8430     }
8431
8432   /* Now we have decided top J priorities.  Reflect the order of the
8433      original priorities to the remaining priorities.  */
8434
8435   for (i = j, j = 0; i < coding_category_max; i++, j++)
8436     {
8437       while (j < coding_category_max
8438              && changed[coding_priorities[j]])
8439         j++;
8440       if (j == coding_category_max)
8441         abort ();
8442       priorities[i] = coding_priorities[j];
8443     }
8444
8445   bcopy (priorities, coding_priorities, sizeof priorities);
8446
8447   /* Update `coding-category-list'.  */
8448   Vcoding_category_list = Qnil;
8449   for (i = coding_category_max - 1; i >= 0; i--)
8450     Vcoding_category_list
8451       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8452                Vcoding_category_list);
8453
8454   return Qnil;
8455 }
8456
8457 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8458        Scoding_system_priority_list, 0, 1, 0,
8459        doc: /* Return a list of coding systems ordered by their priorities.
8460 HIGHESTP non-nil means just return the highest priority one.  */)
8461      (highestp)
8462      Lisp_Object highestp;
8463 {
8464   int i;
8465   Lisp_Object val;
8466
8467   for (i = 0, val = Qnil; i < coding_category_max; i++)
8468     {
8469       enum coding_category category = coding_priorities[i];
8470       int id = coding_categories[category].id;
8471       Lisp_Object attrs;
8472
8473       if (id < 0)
8474         continue;
8475       attrs = CODING_ID_ATTRS (id);
8476       if (! NILP (highestp))
8477         return CODING_ATTR_BASE_NAME (attrs);
8478       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8479     }
8480   return Fnreverse (val);
8481 }
8482
8483 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8484
8485 static Lisp_Object
8486 make_subsidiaries (base)
8487      Lisp_Object base;
8488 {
8489   Lisp_Object subsidiaries;
8490   int base_name_len = SBYTES (SYMBOL_NAME (base));
8491   char *buf = (char *) alloca (base_name_len + 6);
8492   int i;
8493
8494   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8495   subsidiaries = Fmake_vector (make_number (3), Qnil);
8496   for (i = 0; i < 3; i++)
8497     {
8498       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8499       ASET (subsidiaries, i, intern (buf));
8500     }
8501   return subsidiaries;
8502 }
8503
8504
8505 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8506        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8507        doc: /* For internal use only.
8508 usage: (define-coding-system-internal ...)  */)
8509      (nargs, args)
8510      int nargs;
8511      Lisp_Object *args;
8512 {
8513   Lisp_Object name;
8514   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
8515   Lisp_Object attrs;            /* Vector of attributes.  */
8516   Lisp_Object eol_type;
8517   Lisp_Object aliases;
8518   Lisp_Object coding_type, charset_list, safe_charsets;
8519   enum coding_category category;
8520   Lisp_Object tail, val;
8521   int max_charset_id = 0;
8522   int i;
8523
8524   if (nargs < coding_arg_max)
8525     goto short_args;
8526
8527   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8528
8529   name = args[coding_arg_name];
8530   CHECK_SYMBOL (name);
8531   CODING_ATTR_BASE_NAME (attrs) = name;
8532
8533   val = args[coding_arg_mnemonic];
8534   if (! STRINGP (val))
8535     CHECK_CHARACTER (val);
8536   CODING_ATTR_MNEMONIC (attrs) = val;
8537
8538   coding_type = args[coding_arg_coding_type];
8539   CHECK_SYMBOL (coding_type);
8540   CODING_ATTR_TYPE (attrs) = coding_type;
8541
8542   charset_list = args[coding_arg_charset_list];
8543   if (SYMBOLP (charset_list))
8544     {
8545       if (EQ (charset_list, Qiso_2022))
8546         {
8547           if (! EQ (coding_type, Qiso_2022))
8548             error ("Invalid charset-list");
8549           charset_list = Viso_2022_charset_list;
8550         }
8551       else if (EQ (charset_list, Qemacs_mule))
8552         {
8553           if (! EQ (coding_type, Qemacs_mule))
8554             error ("Invalid charset-list");
8555           charset_list = Vemacs_mule_charset_list;
8556         }
8557       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8558         if (max_charset_id < XFASTINT (XCAR (tail)))
8559           max_charset_id = XFASTINT (XCAR (tail));
8560     }
8561   else
8562     {
8563       charset_list = Fcopy_sequence (charset_list);
8564       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8565         {
8566           struct charset *charset;
8567
8568           val = Fcar (tail);
8569           CHECK_CHARSET_GET_CHARSET (val, charset);
8570           if (EQ (coding_type, Qiso_2022)
8571               ? CHARSET_ISO_FINAL (charset) < 0
8572               : EQ (coding_type, Qemacs_mule)
8573               ? CHARSET_EMACS_MULE_ID (charset) < 0
8574               : 0)
8575             error ("Can't handle charset `%s'",
8576                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8577
8578           XSETCAR (tail, make_number (charset->id));
8579           if (max_charset_id < charset->id)
8580             max_charset_id = charset->id;
8581         }
8582     }
8583   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8584
8585   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8586                                 make_number (255));
8587   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8588     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8589   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8590
8591   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8592
8593   val = args[coding_arg_decode_translation_table];
8594   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8595     CHECK_SYMBOL (val);
8596   CODING_ATTR_DECODE_TBL (attrs) = val;
8597
8598   val = args[coding_arg_encode_translation_table];
8599   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8600     CHECK_SYMBOL (val);
8601   CODING_ATTR_ENCODE_TBL (attrs) = val;
8602
8603   val = args[coding_arg_post_read_conversion];
8604   CHECK_SYMBOL (val);
8605   CODING_ATTR_POST_READ (attrs) = val;
8606
8607   val = args[coding_arg_pre_write_conversion];
8608   CHECK_SYMBOL (val);
8609   CODING_ATTR_PRE_WRITE (attrs) = val;
8610
8611   val = args[coding_arg_default_char];
8612   if (NILP (val))
8613     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8614   else
8615     {
8616       CHECK_CHARACTER (val);
8617       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8618     }
8619
8620   val = args[coding_arg_for_unibyte];
8621   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8622
8623   val = args[coding_arg_plist];
8624   CHECK_LIST (val);
8625   CODING_ATTR_PLIST (attrs) = val;
8626
8627   if (EQ (coding_type, Qcharset))
8628     {
8629       /* Generate a lisp vector of 256 elements.  Each element is nil,
8630          integer, or a list of charset IDs.
8631
8632          If Nth element is nil, the byte code N is invalid in this
8633          coding system.
8634
8635          If Nth element is a number NUM, N is the first byte of a
8636          charset whose ID is NUM.
8637
8638          If Nth element is a list of charset IDs, N is the first byte
8639          of one of them.  The list is sorted by dimensions of the
8640          charsets.  A charset of smaller dimension comes firtst. */
8641       val = Fmake_vector (make_number (256), Qnil);
8642
8643       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8644         {
8645           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8646           int dim = CHARSET_DIMENSION (charset);
8647           int idx = (dim - 1) * 4;
8648
8649           if (CHARSET_ASCII_COMPATIBLE_P (charset))
8650             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8651
8652           for (i = charset->code_space[idx];
8653                i <= charset->code_space[idx + 1]; i++)
8654             {
8655               Lisp_Object tmp, tmp2;
8656               int dim2;
8657
8658               tmp = AREF (val, i);
8659               if (NILP (tmp))
8660                 tmp = XCAR (tail);
8661               else if (NUMBERP (tmp))
8662                 {
8663                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8664                   if (dim < dim2)
8665                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8666                   else
8667                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8668                 }
8669               else
8670                 {
8671                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8672                     {
8673                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8674                       if (dim < dim2)
8675                         break;
8676                     }
8677                   if (NILP (tmp2))
8678                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8679                   else
8680                     {
8681                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8682                       XSETCAR (tmp2, XCAR (tail));
8683                     }
8684                 }
8685               ASET (val, i, tmp);
8686             }
8687         }
8688       ASET (attrs, coding_attr_charset_valids, val);
8689       category = coding_category_charset;
8690     }
8691   else if (EQ (coding_type, Qccl))
8692     {
8693       Lisp_Object valids;
8694
8695       if (nargs < coding_arg_ccl_max)
8696         goto short_args;
8697
8698       val = args[coding_arg_ccl_decoder];
8699       CHECK_CCL_PROGRAM (val);
8700       if (VECTORP (val))
8701         val = Fcopy_sequence (val);
8702       ASET (attrs, coding_attr_ccl_decoder, val);
8703
8704       val = args[coding_arg_ccl_encoder];
8705       CHECK_CCL_PROGRAM (val);
8706       if (VECTORP (val))
8707         val = Fcopy_sequence (val);
8708       ASET (attrs, coding_attr_ccl_encoder, val);
8709
8710       val = args[coding_arg_ccl_valids];
8711       valids = Fmake_string (make_number (256), make_number (0));
8712       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8713         {
8714           int from, to;
8715
8716           val = Fcar (tail);
8717           if (INTEGERP (val))
8718             {
8719               from = to = XINT (val);
8720               if (from < 0 || from > 255)
8721                 args_out_of_range_3 (val, make_number (0), make_number (255));
8722             }
8723           else
8724             {
8725               CHECK_CONS (val);
8726               CHECK_NATNUM_CAR (val);
8727               CHECK_NATNUM_CDR (val);
8728               from = XINT (XCAR (val));
8729               if (from > 255)
8730                 args_out_of_range_3 (XCAR (val),
8731                                      make_number (0), make_number (255));
8732               to = XINT (XCDR (val));
8733               if (to < from || to > 255)
8734                 args_out_of_range_3 (XCDR (val),
8735                                      XCAR (val), make_number (255));
8736             }
8737           for (i = from; i <= to; i++)
8738             SSET (valids, i, 1);
8739         }
8740       ASET (attrs, coding_attr_ccl_valids, valids);
8741
8742       category = coding_category_ccl;
8743     }
8744   else if (EQ (coding_type, Qutf_16))
8745     {
8746       Lisp_Object bom, endian;
8747
8748       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8749
8750       if (nargs < coding_arg_utf16_max)
8751         goto short_args;
8752
8753       bom = args[coding_arg_utf16_bom];
8754       if (! NILP (bom) && ! EQ (bom, Qt))
8755         {
8756           CHECK_CONS (bom);
8757           val = XCAR (bom);
8758           CHECK_CODING_SYSTEM (val);
8759           val = XCDR (bom);
8760           CHECK_CODING_SYSTEM (val);
8761         }
8762       ASET (attrs, coding_attr_utf_16_bom, bom);
8763
8764       endian = args[coding_arg_utf16_endian];
8765       CHECK_SYMBOL (endian);
8766       if (NILP (endian))
8767         endian = Qbig;
8768       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8769         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8770       ASET (attrs, coding_attr_utf_16_endian, endian);
8771
8772       category = (CONSP (bom)
8773                   ? coding_category_utf_16_auto
8774                   : NILP (bom)
8775                   ? (EQ (endian, Qbig)
8776                      ? coding_category_utf_16_be_nosig
8777                      : coding_category_utf_16_le_nosig)
8778                   : (EQ (endian, Qbig)
8779                      ? coding_category_utf_16_be
8780                      : coding_category_utf_16_le));
8781     }
8782   else if (EQ (coding_type, Qiso_2022))
8783     {
8784       Lisp_Object initial, reg_usage, request, flags;
8785       int i;
8786
8787       if (nargs < coding_arg_iso2022_max)
8788         goto short_args;
8789
8790       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8791       CHECK_VECTOR (initial);
8792       for (i = 0; i < 4; i++)
8793         {
8794           val = Faref (initial, make_number (i));
8795           if (! NILP (val))
8796             {
8797               struct charset *charset;
8798
8799               CHECK_CHARSET_GET_CHARSET (val, charset);
8800               ASET (initial, i, make_number (CHARSET_ID (charset)));
8801               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8802                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8803             }
8804           else
8805             ASET (initial, i, make_number (-1));
8806         }
8807
8808       reg_usage = args[coding_arg_iso2022_reg_usage];
8809       CHECK_CONS (reg_usage);
8810       CHECK_NUMBER_CAR (reg_usage);
8811       CHECK_NUMBER_CDR (reg_usage);
8812
8813       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8814       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8815         {
8816           int id;
8817           Lisp_Object tmp;
8818
8819           val = Fcar (tail);
8820           CHECK_CONS (val);
8821           tmp = XCAR (val);
8822           CHECK_CHARSET_GET_ID (tmp, id);
8823           CHECK_NATNUM_CDR (val);
8824           if (XINT (XCDR (val)) >= 4)
8825             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8826           XSETCAR (val, make_number (id));
8827         }
8828
8829       flags = args[coding_arg_iso2022_flags];
8830       CHECK_NATNUM (flags);
8831       i = XINT (flags);
8832       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8833         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8834
8835       ASET (attrs, coding_attr_iso_initial, initial);
8836       ASET (attrs, coding_attr_iso_usage, reg_usage);
8837       ASET (attrs, coding_attr_iso_request, request);
8838       ASET (attrs, coding_attr_iso_flags, flags);
8839       setup_iso_safe_charsets (attrs);
8840
8841       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8842         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8843                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8844                     ? coding_category_iso_7_else
8845                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8846                     ? coding_category_iso_7
8847                     : coding_category_iso_7_tight);
8848       else
8849         {
8850           int id = XINT (AREF (initial, 1));
8851
8852           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8853                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8854                        || id < 0)
8855                       ? coding_category_iso_8_else
8856                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8857                       ? coding_category_iso_8_1
8858                       : coding_category_iso_8_2);
8859         }
8860       if (category != coding_category_iso_8_1
8861           && category != coding_category_iso_8_2)
8862         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8863     }
8864   else if (EQ (coding_type, Qemacs_mule))
8865     {
8866       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8867         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8868       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8869       category = coding_category_emacs_mule;
8870     }
8871   else if (EQ (coding_type, Qshift_jis))
8872     {
8873
8874       struct charset *charset;
8875
8876       if (XINT (Flength (charset_list)) != 3
8877           && XINT (Flength (charset_list)) != 4)
8878         error ("There should be three or four charsets");
8879
8880       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8881       if (CHARSET_DIMENSION (charset) != 1)
8882         error ("Dimension of charset %s is not one",
8883                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8884       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8885         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8886
8887       charset_list = XCDR (charset_list);
8888       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8889       if (CHARSET_DIMENSION (charset) != 1)
8890         error ("Dimension of charset %s is not one",
8891                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8892
8893       charset_list = XCDR (charset_list);
8894       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8895       if (CHARSET_DIMENSION (charset) != 2)
8896         error ("Dimension of charset %s is not two",
8897                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8898
8899       charset_list = XCDR (charset_list);
8900       if (! NILP (charset_list))
8901         {
8902           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8903           if (CHARSET_DIMENSION (charset) != 2)
8904             error ("Dimension of charset %s is not two",
8905                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8906         }
8907
8908       category = coding_category_sjis;
8909       Vsjis_coding_system = name;
8910     }
8911   else if (EQ (coding_type, Qbig5))
8912     {
8913       struct charset *charset;
8914
8915       if (XINT (Flength (charset_list)) != 2)
8916         error ("There should be just two charsets");
8917
8918       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8919       if (CHARSET_DIMENSION (charset) != 1)
8920         error ("Dimension of charset %s is not one",
8921                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8922       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8923         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8924
8925       charset_list = XCDR (charset_list);
8926       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8927       if (CHARSET_DIMENSION (charset) != 2)
8928         error ("Dimension of charset %s is not two",
8929                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8930
8931       category = coding_category_big5;
8932       Vbig5_coding_system = name;
8933     }
8934   else if (EQ (coding_type, Qraw_text))
8935     {
8936       category = coding_category_raw_text;
8937       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8938     }
8939   else if (EQ (coding_type, Qutf_8))
8940     {
8941       category = coding_category_utf_8;
8942       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8943     }
8944   else if (EQ (coding_type, Qundecided))
8945     category = coding_category_undecided;
8946   else
8947     error ("Invalid coding system type: %s",
8948            SDATA (SYMBOL_NAME (coding_type)));
8949
8950   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8951   CODING_ATTR_PLIST (attrs)
8952     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8953                                 CODING_ATTR_PLIST (attrs)));
8954   CODING_ATTR_PLIST (attrs)
8955     = Fcons (QCascii_compatible_p,
8956              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
8957                     CODING_ATTR_PLIST (attrs)));
8958
8959   eol_type = args[coding_arg_eol_type];
8960   if (! NILP (eol_type)
8961       && ! EQ (eol_type, Qunix)
8962       && ! EQ (eol_type, Qdos)
8963       && ! EQ (eol_type, Qmac))
8964     error ("Invalid eol-type");
8965
8966   aliases = Fcons (name, Qnil);
8967
8968   if (NILP (eol_type))
8969     {
8970       eol_type = make_subsidiaries (name);
8971       for (i = 0; i < 3; i++)
8972         {
8973           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8974
8975           this_name = AREF (eol_type, i);
8976           this_aliases = Fcons (this_name, Qnil);
8977           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8978           this_spec = Fmake_vector (make_number (3), attrs);
8979           ASET (this_spec, 1, this_aliases);
8980           ASET (this_spec, 2, this_eol_type);
8981           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8982           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8983           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
8984           if (NILP (val))
8985             Vcoding_system_alist
8986               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8987                        Vcoding_system_alist);
8988         }
8989     }
8990
8991   spec_vec = Fmake_vector (make_number (3), attrs);
8992   ASET (spec_vec, 1, aliases);
8993   ASET (spec_vec, 2, eol_type);
8994
8995   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8996   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8997   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
8998   if (NILP (val))
8999     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
9000                                   Vcoding_system_alist);
9001
9002   {
9003     int id = coding_categories[category].id;
9004
9005     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
9006       setup_coding_system (name, &coding_categories[category]);
9007   }
9008
9009   return Qnil;
9010
9011  short_args:
9012   return Fsignal (Qwrong_number_of_arguments,
9013                   Fcons (intern ("define-coding-system-internal"),
9014                          make_number (nargs)));
9015 }
9016
9017
9018 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9019        3, 3, 0,
9020        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
9021   (coding_system, prop, val)
9022      Lisp_Object coding_system, prop, val;
9023 {
9024   Lisp_Object spec, attrs;
9025
9026   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9027   attrs = AREF (spec, 0);
9028   if (EQ (prop, QCmnemonic))
9029     {
9030       if (! STRINGP (val))
9031         CHECK_CHARACTER (val);
9032       CODING_ATTR_MNEMONIC (attrs) = val;
9033     }
9034   else if (EQ (prop, QCdefalut_char))
9035     {
9036       if (NILP (val))
9037         val = make_number (' ');
9038       else
9039         CHECK_CHARACTER (val);
9040       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9041     }
9042   else if (EQ (prop, QCdecode_translation_table))
9043     {
9044       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9045         CHECK_SYMBOL (val);
9046       CODING_ATTR_DECODE_TBL (attrs) = val;
9047     }
9048   else if (EQ (prop, QCencode_translation_table))
9049     {
9050       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9051         CHECK_SYMBOL (val);
9052       CODING_ATTR_ENCODE_TBL (attrs) = val;
9053     }
9054   else if (EQ (prop, QCpost_read_conversion))
9055     {
9056       CHECK_SYMBOL (val);
9057       CODING_ATTR_POST_READ (attrs) = val;
9058     }
9059   else if (EQ (prop, QCpre_write_conversion))
9060     {
9061       CHECK_SYMBOL (val);
9062       CODING_ATTR_PRE_WRITE (attrs) = val;
9063     }
9064   else if (EQ (prop, QCascii_compatible_p))
9065     {
9066       CODING_ATTR_ASCII_COMPAT (attrs) = val;
9067     }
9068
9069   CODING_ATTR_PLIST (attrs)
9070     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9071   return val;
9072 }
9073
9074
9075 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9076        Sdefine_coding_system_alias, 2, 2, 0,
9077        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
9078      (alias, coding_system)
9079      Lisp_Object alias, coding_system;
9080 {
9081   Lisp_Object spec, aliases, eol_type, val;
9082
9083   CHECK_SYMBOL (alias);
9084   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9085   aliases = AREF (spec, 1);
9086   /* ALISES should be a list of length more than zero, and the first
9087      element is a base coding system.  Append ALIAS at the tail of the
9088      list.  */
9089   while (!NILP (XCDR (aliases)))
9090     aliases = XCDR (aliases);
9091   XSETCDR (aliases, Fcons (alias, Qnil));
9092
9093   eol_type = AREF (spec, 2);
9094   if (VECTORP (eol_type))
9095     {
9096       Lisp_Object subsidiaries;
9097       int i;
9098
9099       subsidiaries = make_subsidiaries (alias);
9100       for (i = 0; i < 3; i++)
9101         Fdefine_coding_system_alias (AREF (subsidiaries, i),
9102                                      AREF (eol_type, i));
9103     }
9104
9105   Fputhash (alias, spec, Vcoding_system_hash_table);
9106   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9107   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9108   if (NILP (val))
9109     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9110                                   Vcoding_system_alist);
9111
9112   return Qnil;
9113 }
9114
9115 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9116        1, 1, 0,
9117        doc: /* Return the base of CODING-SYSTEM.
9118 Any alias or subsidiary coding system is not a base coding system.  */)
9119   (coding_system)
9120      Lisp_Object coding_system;
9121 {
9122   Lisp_Object spec, attrs;
9123
9124   if (NILP (coding_system))
9125     return (Qno_conversion);
9126   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9127   attrs = AREF (spec, 0);
9128   return CODING_ATTR_BASE_NAME (attrs);
9129 }
9130
9131 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9132        1, 1, 0,
9133        doc: "Return the property list of CODING-SYSTEM.")
9134      (coding_system)
9135      Lisp_Object coding_system;
9136 {
9137   Lisp_Object spec, attrs;
9138
9139   if (NILP (coding_system))
9140     coding_system = Qno_conversion;
9141   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9142   attrs = AREF (spec, 0);
9143   return CODING_ATTR_PLIST (attrs);
9144 }
9145
9146
9147 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9148        1, 1, 0,
9149        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
9150      (coding_system)
9151      Lisp_Object coding_system;
9152 {
9153   Lisp_Object spec;
9154
9155   if (NILP (coding_system))
9156     coding_system = Qno_conversion;
9157   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9158   return AREF (spec, 1);
9159 }
9160
9161 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9162        Scoding_system_eol_type, 1, 1, 0,
9163        doc: /* Return eol-type of CODING-SYSTEM.
9164 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9165
9166 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9167 and CR respectively.
9168
9169 A vector value indicates that a format of end-of-line should be
9170 detected automatically.  Nth element of the vector is the subsidiary
9171 coding system whose eol-type is N.  */)
9172      (coding_system)
9173      Lisp_Object coding_system;
9174 {
9175   Lisp_Object spec, eol_type;
9176   int n;
9177
9178   if (NILP (coding_system))
9179     coding_system = Qno_conversion;
9180   if (! CODING_SYSTEM_P (coding_system))
9181     return Qnil;
9182   spec = CODING_SYSTEM_SPEC (coding_system);
9183   eol_type = AREF (spec, 2);
9184   if (VECTORP (eol_type))
9185     return Fcopy_sequence (eol_type);
9186   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9187   return make_number (n);
9188 }
9189
9190 #endif /* emacs */
9191
9192 \f
9193 /*** 9. Post-amble ***/
9194
9195 void
9196 init_coding_once ()
9197 {
9198   int i;
9199
9200   for (i = 0; i < coding_category_max; i++)
9201     {
9202       coding_categories[i].id = -1;
9203       coding_priorities[i] = i;
9204     }
9205
9206   /* ISO2022 specific initialize routine.  */
9207   for (i = 0; i < 0x20; i++)
9208     iso_code_class[i] = ISO_control_0;
9209   for (i = 0x21; i < 0x7F; i++)
9210     iso_code_class[i] = ISO_graphic_plane_0;
9211   for (i = 0x80; i < 0xA0; i++)
9212     iso_code_class[i] = ISO_control_1;
9213   for (i = 0xA1; i < 0xFF; i++)
9214     iso_code_class[i] = ISO_graphic_plane_1;
9215   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9216   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9217   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9218   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9219   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9220   iso_code_class[ISO_CODE_ESC] = ISO_escape;
9221   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9222   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9223   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9224
9225   for (i = 0; i < 256; i++)
9226     {
9227       emacs_mule_bytes[i] = 1;
9228     }
9229   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9230   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9231   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9232   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9233 }
9234
9235 #ifdef emacs
9236
9237 void
9238 syms_of_coding ()
9239 {
9240   staticpro (&Vcoding_system_hash_table);
9241   {
9242     Lisp_Object args[2];
9243     args[0] = QCtest;
9244     args[1] = Qeq;
9245     Vcoding_system_hash_table = Fmake_hash_table (2, args);
9246   }
9247
9248   staticpro (&Vsjis_coding_system);
9249   Vsjis_coding_system = Qnil;
9250
9251   staticpro (&Vbig5_coding_system);
9252   Vbig5_coding_system = Qnil;
9253
9254   staticpro (&Vcode_conversion_reused_workbuf);
9255   Vcode_conversion_reused_workbuf = Qnil;
9256
9257   staticpro (&Vcode_conversion_workbuf_name);
9258   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9259
9260   reused_workbuf_in_use = 0;
9261
9262   DEFSYM (Qcharset, "charset");
9263   DEFSYM (Qtarget_idx, "target-idx");
9264   DEFSYM (Qcoding_system_history, "coding-system-history");
9265   Fset (Qcoding_system_history, Qnil);
9266
9267   /* Target FILENAME is the first argument.  */
9268   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9269   /* Target FILENAME is the third argument.  */
9270   Fput (Qwrite_region, Qtarget_idx, make_number (2));
9271
9272   DEFSYM (Qcall_process, "call-process");
9273   /* Target PROGRAM is the first argument.  */
9274   Fput (Qcall_process, Qtarget_idx, make_number (0));
9275
9276   DEFSYM (Qcall_process_region, "call-process-region");
9277   /* Target PROGRAM is the third argument.  */
9278   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9279
9280   DEFSYM (Qstart_process, "start-process");
9281   /* Target PROGRAM is the third argument.  */
9282   Fput (Qstart_process, Qtarget_idx, make_number (2));
9283
9284   DEFSYM (Qopen_network_stream, "open-network-stream");
9285   /* Target SERVICE is the fourth argument.  */
9286   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9287
9288   DEFSYM (Qcoding_system, "coding-system");
9289   DEFSYM (Qcoding_aliases, "coding-aliases");
9290
9291   DEFSYM (Qeol_type, "eol-type");
9292   DEFSYM (Qunix, "unix");
9293   DEFSYM (Qdos, "dos");
9294
9295   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9296   DEFSYM (Qpost_read_conversion, "post-read-conversion");
9297   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9298   DEFSYM (Qdefault_char, "default-char");
9299   DEFSYM (Qundecided, "undecided");
9300   DEFSYM (Qno_conversion, "no-conversion");
9301   DEFSYM (Qraw_text, "raw-text");
9302
9303   DEFSYM (Qiso_2022, "iso-2022");
9304
9305   DEFSYM (Qutf_8, "utf-8");
9306   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9307
9308   DEFSYM (Qutf_16, "utf-16");
9309   DEFSYM (Qbig, "big");
9310   DEFSYM (Qlittle, "little");
9311
9312   DEFSYM (Qshift_jis, "shift-jis");
9313   DEFSYM (Qbig5, "big5");
9314
9315   DEFSYM (Qcoding_system_p, "coding-system-p");
9316
9317   DEFSYM (Qcoding_system_error, "coding-system-error");
9318   Fput (Qcoding_system_error, Qerror_conditions,
9319         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9320   Fput (Qcoding_system_error, Qerror_message,
9321         build_string ("Invalid coding system"));
9322
9323   /* Intern this now in case it isn't already done.
9324      Setting this variable twice is harmless.
9325      But don't staticpro it here--that is done in alloc.c.  */
9326   Qchar_table_extra_slots = intern ("char-table-extra-slots");
9327
9328   DEFSYM (Qtranslation_table, "translation-table");
9329   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9330   DEFSYM (Qtranslation_table_id, "translation-table-id");
9331   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9332   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9333
9334   DEFSYM (Qvalid_codes, "valid-codes");
9335
9336   DEFSYM (Qemacs_mule, "emacs-mule");
9337
9338   DEFSYM (QCcategory, ":category");
9339   DEFSYM (QCmnemonic, ":mnemonic");
9340   DEFSYM (QCdefalut_char, ":default-char");
9341   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9342   DEFSYM (QCencode_translation_table, ":encode-translation-table");
9343   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9344   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9345   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9346
9347   Vcoding_category_table
9348     = Fmake_vector (make_number (coding_category_max), Qnil);
9349   staticpro (&Vcoding_category_table);
9350   /* Followings are target of code detection.  */
9351   ASET (Vcoding_category_table, coding_category_iso_7,
9352         intern ("coding-category-iso-7"));
9353   ASET (Vcoding_category_table, coding_category_iso_7_tight,
9354         intern ("coding-category-iso-7-tight"));
9355   ASET (Vcoding_category_table, coding_category_iso_8_1,
9356         intern ("coding-category-iso-8-1"));
9357   ASET (Vcoding_category_table, coding_category_iso_8_2,
9358         intern ("coding-category-iso-8-2"));
9359   ASET (Vcoding_category_table, coding_category_iso_7_else,
9360         intern ("coding-category-iso-7-else"));
9361   ASET (Vcoding_category_table, coding_category_iso_8_else,
9362         intern ("coding-category-iso-8-else"));
9363   ASET (Vcoding_category_table, coding_category_utf_8,
9364         intern ("coding-category-utf-8"));
9365   ASET (Vcoding_category_table, coding_category_utf_16_be,
9366         intern ("coding-category-utf-16-be"));
9367   ASET (Vcoding_category_table, coding_category_utf_16_auto,
9368         intern ("coding-category-utf-16-auto"));
9369   ASET (Vcoding_category_table, coding_category_utf_16_le,
9370         intern ("coding-category-utf-16-le"));
9371   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9372         intern ("coding-category-utf-16-be-nosig"));
9373   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9374         intern ("coding-category-utf-16-le-nosig"));
9375   ASET (Vcoding_category_table, coding_category_charset,
9376         intern ("coding-category-charset"));
9377   ASET (Vcoding_category_table, coding_category_sjis,
9378         intern ("coding-category-sjis"));
9379   ASET (Vcoding_category_table, coding_category_big5,
9380         intern ("coding-category-big5"));
9381   ASET (Vcoding_category_table, coding_category_ccl,
9382         intern ("coding-category-ccl"));
9383   ASET (Vcoding_category_table, coding_category_emacs_mule,
9384         intern ("coding-category-emacs-mule"));
9385   /* Followings are NOT target of code detection.  */
9386   ASET (Vcoding_category_table, coding_category_raw_text,
9387         intern ("coding-category-raw-text"));
9388   ASET (Vcoding_category_table, coding_category_undecided,
9389         intern ("coding-category-undecided"));
9390
9391   DEFSYM (Qinsufficient_source, "insufficient-source");
9392   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9393   DEFSYM (Qinvalid_source, "invalid-source");
9394   DEFSYM (Qinterrupted, "interrupted");
9395   DEFSYM (Qinsufficient_memory, "insufficient-memory");
9396   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
9397
9398   defsubr (&Scoding_system_p);
9399   defsubr (&Sread_coding_system);
9400   defsubr (&Sread_non_nil_coding_system);
9401   defsubr (&Scheck_coding_system);
9402   defsubr (&Sdetect_coding_region);
9403   defsubr (&Sdetect_coding_string);
9404   defsubr (&Sfind_coding_systems_region_internal);
9405   defsubr (&Sunencodable_char_position);
9406   defsubr (&Scheck_coding_systems_region);
9407   defsubr (&Sdecode_coding_region);
9408   defsubr (&Sencode_coding_region);
9409   defsubr (&Sdecode_coding_string);
9410   defsubr (&Sencode_coding_string);
9411   defsubr (&Sdecode_sjis_char);
9412   defsubr (&Sencode_sjis_char);
9413   defsubr (&Sdecode_big5_char);
9414   defsubr (&Sencode_big5_char);
9415   defsubr (&Sset_terminal_coding_system_internal);
9416   defsubr (&Sset_safe_terminal_coding_system_internal);
9417   defsubr (&Sterminal_coding_system);
9418   defsubr (&Sset_keyboard_coding_system_internal);
9419   defsubr (&Skeyboard_coding_system);
9420   defsubr (&Sfind_operation_coding_system);
9421   defsubr (&Sset_coding_system_priority);
9422   defsubr (&Sdefine_coding_system_internal);
9423   defsubr (&Sdefine_coding_system_alias);
9424   defsubr (&Scoding_system_put);
9425   defsubr (&Scoding_system_base);
9426   defsubr (&Scoding_system_plist);
9427   defsubr (&Scoding_system_aliases);
9428   defsubr (&Scoding_system_eol_type);
9429   defsubr (&Scoding_system_priority_list);
9430
9431   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9432                doc: /* List of coding systems.
9433
9434 Do not alter the value of this variable manually.  This variable should be
9435 updated by the functions `define-coding-system' and
9436 `define-coding-system-alias'.  */);
9437   Vcoding_system_list = Qnil;
9438
9439   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9440                doc: /* Alist of coding system names.
9441 Each element is one element list of coding system name.
9442 This variable is given to `completing-read' as TABLE argument.
9443
9444 Do not alter the value of this variable manually.  This variable should be
9445 updated by the functions `make-coding-system' and
9446 `define-coding-system-alias'.  */);
9447   Vcoding_system_alist = Qnil;
9448
9449   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9450                doc: /* List of coding-categories (symbols) ordered by priority.
9451
9452 On detecting a coding system, Emacs tries code detection algorithms
9453 associated with each coding-category one by one in this order.  When
9454 one algorithm agrees with a byte sequence of source text, the coding
9455 system bound to the corresponding coding-category is selected.
9456
9457 Don't modify this variable directly, but use `set-coding-priority'.  */);
9458   {
9459     int i;
9460
9461     Vcoding_category_list = Qnil;
9462     for (i = coding_category_max - 1; i >= 0; i--)
9463       Vcoding_category_list
9464         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9465                  Vcoding_category_list);
9466   }
9467
9468   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9469                doc: /* Specify the coding system for read operations.
9470 It is useful to bind this variable with `let', but do not set it globally.
9471 If the value is a coding system, it is used for decoding on read operation.
9472 If not, an appropriate element is used from one of the coding system alists:
9473 There are three such tables, `file-coding-system-alist',
9474 `process-coding-system-alist', and `network-coding-system-alist'.  */);
9475   Vcoding_system_for_read = Qnil;
9476
9477   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9478                doc: /* Specify the coding system for write operations.
9479 Programs bind this variable with `let', but you should not set it globally.
9480 If the value is a coding system, it is used for encoding of output,
9481 when writing it to a file and when sending it to a file or subprocess.
9482
9483 If this does not specify a coding system, an appropriate element
9484 is used from one of the coding system alists:
9485 There are three such tables, `file-coding-system-alist',
9486 `process-coding-system-alist', and `network-coding-system-alist'.
9487 For output to files, if the above procedure does not specify a coding system,
9488 the value of `buffer-file-coding-system' is used.  */);
9489   Vcoding_system_for_write = Qnil;
9490
9491   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9492                doc: /*
9493 Coding system used in the latest file or process I/O.  */);
9494   Vlast_coding_system_used = Qnil;
9495
9496   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9497                doc: /*
9498 Error status of the last code conversion.
9499
9500 When an error was detected in the last code conversion, this variable
9501 is set to one of the following symbols.
9502   `insufficient-source'
9503   `inconsistent-eol'
9504   `invalid-source'
9505   `interrupted'
9506   `insufficient-memory'
9507 When no error was detected, the value doesn't change.  So, to check
9508 the error status of a code conversion by this variable, you must
9509 explicitly set this variable to nil before performing code
9510 conversion.  */);
9511   Vlast_code_conversion_error = Qnil;
9512
9513   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9514                doc: /*
9515 *Non-nil means always inhibit code conversion of end-of-line format.
9516 See info node `Coding Systems' and info node `Text and Binary' concerning
9517 such conversion.  */);
9518   inhibit_eol_conversion = 0;
9519
9520   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9521                doc: /*
9522 Non-nil means process buffer inherits coding system of process output.
9523 Bind it to t if the process output is to be treated as if it were a file
9524 read from some filesystem.  */);
9525   inherit_process_coding_system = 0;
9526
9527   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9528                doc: /*
9529 Alist to decide a coding system to use for a file I/O operation.
9530 The format is ((PATTERN . VAL) ...),
9531 where PATTERN is a regular expression matching a file name,
9532 VAL is a coding system, a cons of coding systems, or a function symbol.
9533 If VAL is a coding system, it is used for both decoding and encoding
9534 the file contents.
9535 If VAL is a cons of coding systems, the car part is used for decoding,
9536 and the cdr part is used for encoding.
9537 If VAL is a function symbol, the function must return a coding system
9538 or a cons of coding systems which are used as above.  The function gets
9539 the arguments with which `find-operation-coding-systems' was called.
9540
9541 See also the function `find-operation-coding-system'
9542 and the variable `auto-coding-alist'.  */);
9543   Vfile_coding_system_alist = Qnil;
9544
9545   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9546                doc: /*
9547 Alist to decide a coding system to use for a process I/O operation.
9548 The format is ((PATTERN . VAL) ...),
9549 where PATTERN is a regular expression matching a program name,
9550 VAL is a coding system, a cons of coding systems, or a function symbol.
9551 If VAL is a coding system, it is used for both decoding what received
9552 from the program and encoding what sent to the program.
9553 If VAL is a cons of coding systems, the car part is used for decoding,
9554 and the cdr part is used for encoding.
9555 If VAL is a function symbol, the function must return a coding system
9556 or a cons of coding systems which are used as above.
9557
9558 See also the function `find-operation-coding-system'.  */);
9559   Vprocess_coding_system_alist = Qnil;
9560
9561   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9562                doc: /*
9563 Alist to decide a coding system to use for a network I/O operation.
9564 The format is ((PATTERN . VAL) ...),
9565 where PATTERN is a regular expression matching a network service name
9566 or is a port number to connect to,
9567 VAL is a coding system, a cons of coding systems, or a function symbol.
9568 If VAL is a coding system, it is used for both decoding what received
9569 from the network stream and encoding what sent to the network stream.
9570 If VAL is a cons of coding systems, the car part is used for decoding,
9571 and the cdr part is used for encoding.
9572 If VAL is a function symbol, the function must return a coding system
9573 or a cons of coding systems which are used as above.
9574
9575 See also the function `find-operation-coding-system'.  */);
9576   Vnetwork_coding_system_alist = Qnil;
9577
9578   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9579                doc: /* Coding system to use with system messages.
9580 Also used for decoding keyboard input on X Window system.  */);
9581   Vlocale_coding_system = Qnil;
9582
9583   /* The eol mnemonics are reset in startup.el system-dependently.  */
9584   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9585                doc: /*
9586 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
9587   eol_mnemonic_unix = build_string (":");
9588
9589   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9590                doc: /*
9591 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
9592   eol_mnemonic_dos = build_string ("\\");
9593
9594   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9595                doc: /*
9596 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
9597   eol_mnemonic_mac = build_string ("/");
9598
9599   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9600                doc: /*
9601 *String displayed in mode line when end-of-line format is not yet determined.  */);
9602   eol_mnemonic_undecided = build_string (":");
9603
9604   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9605                doc: /*
9606 *Non-nil enables character translation while encoding and decoding.  */);
9607   Venable_character_translation = Qt;
9608
9609   DEFVAR_LISP ("standard-translation-table-for-decode",
9610                &Vstandard_translation_table_for_decode,
9611                doc: /* Table for translating characters while decoding.  */);
9612   Vstandard_translation_table_for_decode = Qnil;
9613
9614   DEFVAR_LISP ("standard-translation-table-for-encode",
9615                &Vstandard_translation_table_for_encode,
9616                doc: /* Table for translating characters while encoding.  */);
9617   Vstandard_translation_table_for_encode = Qnil;
9618
9619   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9620                doc: /* Alist of charsets vs revision numbers.
9621 While encoding, if a charset (car part of an element) is found,
9622 designate it with the escape sequence identifying revision (cdr part
9623 of the element).  */);
9624   Vcharset_revision_table = Qnil;
9625
9626   DEFVAR_LISP ("default-process-coding-system",
9627                &Vdefault_process_coding_system,
9628                doc: /* Cons of coding systems used for process I/O by default.
9629 The car part is used for decoding a process output,
9630 the cdr part is used for encoding a text to be sent to a process.  */);
9631   Vdefault_process_coding_system = Qnil;
9632
9633   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9634                doc: /*
9635 Table of extra Latin codes in the range 128..159 (inclusive).
9636 This is a vector of length 256.
9637 If Nth element is non-nil, the existence of code N in a file
9638 \(or output of subprocess) doesn't prevent it to be detected as
9639 a coding system of ISO 2022 variant which has a flag
9640 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9641 or reading output of a subprocess.
9642 Only 128th through 159th elements has a meaning.  */);
9643   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9644
9645   DEFVAR_LISP ("select-safe-coding-system-function",
9646                &Vselect_safe_coding_system_function,
9647                doc: /*
9648 Function to call to select safe coding system for encoding a text.
9649
9650 If set, this function is called to force a user to select a proper
9651 coding system which can encode the text in the case that a default
9652 coding system used in each operation can't encode the text.
9653
9654 The default value is `select-safe-coding-system' (which see).  */);
9655   Vselect_safe_coding_system_function = Qnil;
9656
9657   DEFVAR_BOOL ("coding-system-require-warning",
9658                &coding_system_require_warning,
9659                doc: /* Internal use only.
9660 If non-nil, on writing a file, `select-safe-coding-system-function' is
9661 called even if `coding-system-for-write' is non-nil.  The command
9662 `universal-coding-system-argument' binds this variable to t temporarily.  */);
9663   coding_system_require_warning = 0;
9664
9665
9666   DEFVAR_BOOL ("inhibit-iso-escape-detection",
9667                &inhibit_iso_escape_detection,
9668                doc: /*
9669 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9670
9671 By default, on reading a file, Emacs tries to detect how the text is
9672 encoded.  This code detection is sensitive to escape sequences.  If
9673 the sequence is valid as ISO2022, the code is determined as one of
9674 the ISO2022 encodings, and the file is decoded by the corresponding
9675 coding system (e.g. `iso-2022-7bit').
9676
9677 However, there may be a case that you want to read escape sequences in
9678 a file as is.  In such a case, you can set this variable to non-nil.
9679 Then, as the code detection ignores any escape sequences, no file is
9680 detected as encoded in some ISO2022 encoding.  The result is that all
9681 escape sequences become visible in a buffer.
9682
9683 The default value is nil, and it is strongly recommended not to change
9684 it.  That is because many Emacs Lisp source files that contain
9685 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9686 in Emacs's distribution, and they won't be decoded correctly on
9687 reading if you suppress escape sequence detection.
9688
9689 The other way to read escape sequences in a file without decoding is
9690 to explicitly specify some coding system that doesn't use ISO2022's
9691 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
9692   inhibit_iso_escape_detection = 0;
9693
9694   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9695                doc: /* Char table for translating self-inserting characters.
9696 This is applied to the result of input methods, not their input.  See also
9697 `keyboard-translate-table'.  */);
9698     Vtranslation_table_for_input = Qnil;
9699
9700   {
9701     Lisp_Object args[coding_arg_max];
9702     Lisp_Object plist[16];
9703     int i;
9704
9705     for (i = 0; i < coding_arg_max; i++)
9706       args[i] = Qnil;
9707
9708     plist[0] = intern (":name");
9709     plist[1] = args[coding_arg_name] = Qno_conversion;
9710     plist[2] = intern (":mnemonic");
9711     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9712     plist[4] = intern (":coding-type");
9713     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9714     plist[6] = intern (":ascii-compatible-p");
9715     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9716     plist[8] = intern (":default-char");
9717     plist[9] = args[coding_arg_default_char] = make_number (0);
9718     plist[10] = intern (":for-unibyte");
9719     plist[11] = args[coding_arg_for_unibyte] = Qt;
9720     plist[12] = intern (":docstring");
9721     plist[13] = build_string ("Do no conversion.\n\
9722 \n\
9723 When you visit a file with this coding, the file is read into a\n\
9724 unibyte buffer as is, thus each byte of a file is treated as a\n\
9725 character.");
9726     plist[14] = intern (":eol-type");
9727     plist[15] = args[coding_arg_eol_type] = Qunix;
9728     args[coding_arg_plist] = Flist (16, plist);
9729     Fdefine_coding_system_internal (coding_arg_max, args);
9730
9731     plist[1] = args[coding_arg_name] = Qundecided;
9732     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
9733     plist[5] = args[coding_arg_coding_type] = Qundecided;
9734     /* This is already set.
9735        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
9736     plist[8] = intern (":charset-list");
9737     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
9738     plist[11] = args[coding_arg_for_unibyte] = Qnil;
9739     plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
9740     plist[15] = args[coding_arg_eol_type] = Qnil;
9741     args[coding_arg_plist] = Flist (16, plist);
9742     Fdefine_coding_system_internal (coding_arg_max, args);
9743   }
9744
9745   setup_coding_system (Qno_conversion, &keyboard_coding);
9746   setup_coding_system (Qundecided, &terminal_coding);
9747   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9748
9749   {
9750     int i;
9751
9752     for (i = 0; i < coding_category_max; i++)
9753       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9754   }
9755 #if defined (MSDOS) || defined (WINDOWSNT)
9756   system_eol_type = Qdos;
9757 #else
9758   system_eol_type = Qunix;
9759 #endif
9760   staticpro (&system_eol_type);
9761 }
9762
9763 char *
9764 emacs_strerror (error_number)
9765      int error_number;
9766 {
9767   char *str;
9768
9769   synchronize_system_messages_locale ();
9770   str = strerror (error_number);
9771
9772   if (! NILP (Vlocale_coding_system))
9773     {
9774       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9775                                                       Vlocale_coding_system,
9776                                                       0);
9777       str = (char *) SDATA (dec);
9778     }
9779
9780   return str;
9781 }
9782
9783 #endif /* emacs */
9784
9785 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9786    (do not change this comment) */