src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software; you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation; either version 2, or (at your option)
  16 any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs; see the file COPYING.  If not, write to
  25 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  26 Boston, MA 02110-1301, USA.  */
  27
  28 /*** TABLE OF CONTENTS ***
  29
  30   0. General comments
  31   1. Preamble
  32   2. Emacs' internal format (emacs-utf-8) handlers
  33   3. UTF-8 handlers
  34   4. UTF-16 handlers
  35   5. Charset-base coding systems handlers
  36   6. emacs-mule (old Emacs' internal format) handlers
  37   7. ISO2022 handlers
  38   8. Shift-JIS and BIG5 handlers
  39   9. CCL handlers
  40   10. C library functions
  41   11. Emacs Lisp library functions
  42   12. Postamble
  43
  44 */
  45
  46 /*** 0. General comments ***
  47
  48
  49 CODING SYSTEM
  50
  51   A coding system is an object for an encoding mechanism that contains
  52   information about how to convert byte sequences to character
  53   sequences and vice versa.  When we say "decode", it means converting
  54   a byte sequence of a specific coding system into a character
  55   sequence that is represented by Emacs' internal coding system
  56   `emacs-utf-8', and when we say "encode", it means converting a
  57   character sequence of emacs-utf-8 to a byte sequence of a specific
  58   coding system.
  59
  60   In Emacs Lisp, a coding system is represented by a Lisp symbol.  At
  61   the C level, a coding system is represented by a vector of attributes
  62   stored in the hash table Vcoding_system_hash_table.  The conversion
  63   from a coding system symbol to its attributes vector is done by
  64   looking up the symbol in Vcoding_system_hash_table.
  65
  66   Coding systems are classified into the following types depending on
  67   the encoding mechanism.  Here's a brief description of the types.
  68
  69   o UTF-8
  70
  71   o UTF-16
  72
  73   o Charset-base coding system
  74
  75   A coding system defined by one or more (coded) character sets.
  76   Decoding and encoding are done by a code converter defined for each
  77   character set.
  78
  79   o Old Emacs internal format (emacs-mule)
  80
  81   The coding system adopted by old versions of Emacs (20 and 21).
  82
  83   o ISO2022-base coding system
  84
  85   The most famous coding system for multiple character sets.  X's
  86   Compound Text, various EUCs (Extended Unix Code), and coding systems
  87   used in the Internet communication such as ISO-2022-JP are all
  88   variants of ISO2022.
  89
  90   o SJIS (or Shift-JIS or MS-Kanji-Code)
  91
  92   A coding system to encode character sets: ASCII, JISX0201, and
  93   JISX0208.  Widely used for PC's in Japan.  Details are described in
  94   section 8.
  95
  96   o BIG5
  97
  98   A coding system to encode character sets: ASCII and Big5.  Widely
  99   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
 100   described in section 8.  In this file, when we write "big5" (all
 101   lowercase), we mean the coding system, and when we write "Big5"
 102   (capitalized), we mean the character set.
 103
 104   o CCL
 105
 106   If a user wants to decode/encode text encoded in a coding system
 107   not listed above, he can supply a decoder and an encoder for it in
 108   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 109   program while decoding/encoding.
 110
 111   o Raw-text
 112
 113   A coding system for text containing raw eight-bit data.  Emacs
 114   treats each byte of source text as a character (except for
 115   end-of-line conversion).
 116
 117   o No-conversion
 118
 119   Like raw text, but don't do end-of-line conversion.
 120
 121
 122 END-OF-LINE FORMAT
 123
 124   How the end of a line of text is encoded depends on the operating
 125   system.  For instance, Unix's format is a single LF (line-feed) byte,
 126   whereas DOS's format is a two-byte sequence of `carriage-return' and
 127   `line-feed' codes.  MacOS's format is usually one byte of
 128   `carriage-return'.
 129
 130   Since text character encoding and end-of-line encoding are
 131   independent, any coding system described above can take any format
 132   of end-of-line (except for no-conversion).
 133
 134 STRUCT CODING_SYSTEM
 135
 136   Before using a coding system for code conversion (i.e. decoding and
 137   encoding), we set up a structure of type `struct coding_system'.
 138   This structure keeps various information about a specific code
 139   conversion (e.g. the location of source and destination data).
 140
 141 */
 142
 143 /* COMMON MACROS */
 144
 145
 146 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 147
 148   These functions check if a byte sequence specified as a source in
 149   CODING conforms to the format of XXX, and update the members of
 150   DETECT_INFO.
 151
 152   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 153
 154   Below is the template of these functions.  */
 155
 156 #if 0
     /* Template only (never compiled): scan the source of CODING and
        report through DETECT_INFO whether it can be XXX.  Return 1 and
        OR FOUND into DETECT_INFO->found if the source is used up without
        meeting a nonconforming byte; otherwise set CATEGORY_MASK_XXX in
        DETECT_INFO->rejected and return 0.  */
 157 static int
 158 detect_coding_XXX (coding, detect_info)
 159      struct coding_system *coding;
 160      struct coding_detection_info *detect_info;
 161 {
 162   const unsigned char *src = coding->source;
 163   const unsigned char *src_end = coding->source + coding->src_bytes;
 164   int multibytep = coding->src_multibyte;
 165   int consumed_chars = 0;
 166   int found = 0;
 167   ...;  /* Other locals; ONE_MORE_BYTE also uses C and SRC_BASE.  */
 168
 169   while (1)
 170     {
 171       /* Get one byte from the source.  If the source is exhausted,
 172          jump to no_more_source:.  */
 173       ONE_MORE_BYTE (c);
 174
 175       if (! __C_conforms_to_XXX___ (c))
 176         break;
           /* NOTE(review): the `!' below looks inverted -- FOUND is
              presumably meant to be set when C strongly suggests XXX.
              Confirm before copying this template.  */
 177       if (! __C_strongly_suggests_XXX__ (c))
 178         found = CATEGORY_MASK_XXX;
 179     }
 180   /* The byte sequence is invalid for XXX.  */
 181   detect_info->rejected |= CATEGORY_MASK_XXX;
 182   return 0;
 183
 184  no_more_source:
 185   /* The source was exhausted without meeting an invalid byte.  */
 186   detect_info->found |= found;
 187   return 1;
 188 }
 189 #endif
 190
 191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 192
 193   These functions decode a byte sequence specified as a source by
 194   CODING.  The decoded characters go to the place pointed to by
 195   CODING->charbuf, the length of which should not exceed
 196   CODING->charbuf_size.
 197
 198   These functions set the information of original and decoded texts in
 199   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 200   They also set CODING->result to one of CODING_RESULT_XXX indicating
 201   how the decoding is finished.
 202
 203   Below is the template of these functions.  */
 204
 205 #if 0
     /* Template only (never compiled): decode the not-yet-consumed part
        of CODING's source into CODING->charbuf, then record in CODING
        how far the source was consumed and how much of CHARBUF is
        used.  */
 206 static void
 207 decode_coding_XXXX (coding)
 208      struct coding_system *coding;
 209 {
 210   const unsigned char *src = coding->source + coding->consumed;
 211   const unsigned char *src_end = coding->source + coding->src_bytes;
 212   /* SRC_BASE remembers the start position in source in each loop.
 213      The loop will be exited when there's not enough source code, or
 214      when there's no room in CHARBUF for a decoded character.  */
 215   const unsigned char *src_base;
 216   /* A buffer to produce decoded characters.  */
 217   int *charbuf = coding->charbuf + coding->charbuf_used;
 218   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 219   int multibytep = coding->src_multibyte;
       /* ONE_MORE_BYTE below also expects C and CONSUMED_CHARS to be
          declared.  */
 220
 221   while (1)
 222     {
 223       src_base = src;
           /* Stop once CHARBUF is full.  The test must be `>=': with
              `<' the loop would quit while room is still left.  */
 224       if (charbuf >= charbuf_end)
 225         /* No more room to produce a decoded character.  */
 226         break;
 227       ONE_MORE_BYTE (c);
 228       /* Decode it. */
 229     }
 230
 231  no_more_source:
 232   if (src_base < src_end
 233       && coding->mode & CODING_MODE_LAST_BLOCK)
 234     /* If the source ends by partial bytes to construct a character,
 235        treat them as eight-bit raw data.  */
 236     while (src_base < src_end && charbuf < charbuf_end)
 237       *charbuf++ = *src_base++;
 238   /* Remember how many bytes and characters we consumed.  If the
 239      source is multibyte, the bytes and chars are not identical.  */
 240   coding->consumed = coding->consumed_char = src_base - coding->source;
 241   /* Remember how many characters we produced.  */
 242   coding->charbuf_used = charbuf - coding->charbuf;
 243 }
 244 #endif
 245
 246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 247
 248   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 249   internal multibyte format by CODING.  The resulting byte sequence
 250   goes to a place pointed to by DESTINATION, the length of which
 251   should not exceed DST_BYTES.
 252
 253   These functions set the information of original and encoded texts in
 254   the members produced, produced_char, consumed, and consumed_char of
 255   the structure *CODING.  They also set the member result to one of
 256   CODING_RESULT_XXX indicating how the encoding finished.
 257
 258   DST_BYTES zero means that the source area and the destination area
 259   overlap, which means that we can produce encoded text only until it
 260   reaches the head of the not-yet-encoded source text.
 261
 262   Below is a template of these functions.  */
 263 #if 0
     /* Template only (never compiled): encode the characters held in
        CODING->charbuf into CODING->destination, then record in CODING
        how many characters and bytes were produced.  */
 264 static void
 265 encode_coding_XXX (coding)
 266      struct coding_system *coding;
 267 {
 268   int multibytep = coding->dst_multibyte;
 269   int *charbuf = coding->charbuf;
       /* CHARBUF is a plain `int *', so the end is computed from it
          directly (it has no `charbuf' member to dereference).  */
 270   int *charbuf_end = charbuf + coding->charbuf_used;
 271   unsigned char *dst = coding->destination + coding->produced;
 272   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Leave room for the longest byte sequence one iteration of the
          loop may produce.  */
 273   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 274   int produced_chars = 0;
 275
 276   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 277     {
 278       int c = *charbuf;
 279       /* Encode C into DST, and increment DST.  */
 280     }
 281  label_no_more_destination:
 282   /* How many chars and bytes we produced.  */
 283   coding->produced_char += produced_chars;
 284   coding->produced = dst - coding->destination;
 285 }
 286 #endif
 287
 288 \f
 289 /*** 1. Preamble ***/
 290
 291 #include <config.h>
 292 #include <stdio.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302
     /* NOTE(review): presumably the table that maps a coding system
        symbol to its attribute vector (the CODING SYSTEM notes at the
        top of this file describe such a table) -- confirm.  */
 303 Lisp_Object Vcoding_system_hash_table;
 304
     /* The objects named Q... and QC... below are presumably Lisp symbols
        and keywords; their initialization is not in this part of the
        file.  */
 305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 306 Lisp_Object Qunix, Qdos;
 307 extern Lisp_Object Qmac;        /* defined in frame.c */
 308 Lisp_Object Qbuffer_file_coding_system;
 309 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 310 Lisp_Object Qdefault_char;
 311 Lisp_Object Qno_conversion, Qundecided;
 312 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 313 Lisp_Object Qbig, Qlittle;
 314 Lisp_Object Qcoding_system_history;
 315 Lisp_Object Qvalid_codes;
 316 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
     /* NOTE(review): `QCdefalut_char' above looks like a misspelling of
        `QCdefault_char'.  It has external linkage, so renaming it means
        updating every user at the same time.  */
 317 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 318 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 319 Lisp_Object QCascii_compatible_p;
 320
     /* The two objects declared `extern' on the next line are defined
        outside this file.  */
 321 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 322 Lisp_Object Qcall_process, Qcall_process_region;
 323 Lisp_Object Qstart_process, Qopen_network_stream;
 324 Lisp_Object Qtarget_idx;
 325
     /* NOTE(review): judging by their names, these presumably label the
        possible outcomes of a code conversion -- confirm.  */
 326 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 327 Lisp_Object Qinterrupted, Qinsufficient_memory;
 328
 329 /* If a symbol has this property, evaluate the value to define the
 330    symbol as a coding system.  */
 331 static Lisp_Object Qcoding_system_define_form;
 332
     /* NOTE(review): not documented here; see the users of this flag.  */
 333 int coding_system_require_warning;
 334
     /* NOTE(review): presumably holds a Lisp function used to select a
        safe coding system -- confirm.  */
 335 Lisp_Object Vselect_safe_coding_system_function;
 336
 337 /* Mnemonic strings, one for each end-of-line format.  */
 338 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 339 /* Mnemonic string indicating that the end-of-line format is not yet
 340    decided.  */
 341 Lisp_Object eol_mnemonic_undecided;
 342
 343 #ifdef emacs
     /* Everything down to the matching #endif is compiled only when
        `emacs' is defined.  */
 344
     /* NOTE(review): presumably the list and the alist of all defined
        coding systems -- confirm.  */
 345 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 346
 347 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 348
 349 /* Coding system emacs-mule and raw-text are for converting only
 350    end-of-line format.  */
 351 Lisp_Object Qemacs_mule, Qraw_text;
 352 Lisp_Object Qutf_8_emacs;
 353
 354 /* Coding systems are passed between Emacs Lisp programs and internal
 355    C routines through the following three variables.  */
 356 /* Coding system for reading files and receiving data from a process.  */
 357 Lisp_Object Vcoding_system_for_read;
 358 /* Coding system for writing files and sending data to a process.  */
 359 Lisp_Object Vcoding_system_for_write;
 360 /* Coding system actually used in the latest I/O.  */
 361 Lisp_Object Vlast_coding_system_used;
 362 /* Set to non-nil when an error is detected during code conversion.  */
 363 Lisp_Object Vlast_code_conversion_error;
 364 /* A vector of length 256 which contains information about special
 365    Latin codes (especially for dealing with Microsoft codes).  */
 366 Lisp_Object Vlatin_extra_code_table;
 367
 368 /* Flag to inhibit code conversion of end-of-line format.  */
 369 int inhibit_eol_conversion;
 370
 371 /* Flag to inhibit ISO2022 escape sequence detection.  */
 372 int inhibit_iso_escape_detection;
 373
 374 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 375 int inherit_process_coding_system;
 376
 377 /* Coding system to be used to encode text for terminal display.  */
 378 struct coding_system terminal_coding;
 379
 380 /* Coding system to be used to encode text for terminal display when
 381    the terminal coding system is nil.  */
 382 struct coding_system safe_terminal_coding;
 383
 384 /* Coding system of what is sent from the terminal keyboard.  */
 385 struct coding_system keyboard_coding;
 386
     /* NOTE(review): presumably alists used to choose a coding system for
        file, process and network I/O respectively -- confirm.  */
 387 Lisp_Object Vfile_coding_system_alist;
 388 Lisp_Object Vprocess_coding_system_alist;
 389 Lisp_Object Vnetwork_coding_system_alist;
 390
 391 Lisp_Object Vlocale_coding_system;
 392
 393 #endif /* emacs */
 394
 395 /* Flag telling whether translation tables are looked up during
 396    character code conversion.  */
 397 Lisp_Object Venable_character_translation;
 398 /* Standard translation table to look up on decoding (reading).  */
 399 Lisp_Object Vstandard_translation_table_for_decode;
 400 /* Standard translation table to look up on encoding (writing).  */
 401 Lisp_Object Vstandard_translation_table_for_encode;
 402
     /* NOTE(review): presumably Lisp symbols related to translation
        tables; their initialization is not in this part of the file.  */
 403 Lisp_Object Qtranslation_table;
 404 Lisp_Object Qtranslation_table_id;
 405 Lisp_Object Qtranslation_table_for_decode;
 406 Lisp_Object Qtranslation_table_for_encode;
 407
 408 /* Alist of charsets vs revision number.  */
 409 static Lisp_Object Vcharset_revision_table;
 410
 411 /* Default coding systems used for process I/O.  */
 412 Lisp_Object Vdefault_process_coding_system;
 413
 414 /* Char table for translating Quail and self-inserting input.  */
 415 Lisp_Object Vtranslation_table_for_input;
 416
 417 /* Two special coding systems (going by the names, those for SJIS and
     for BIG5).  */
 418 Lisp_Object Vsjis_coding_system;
 419 Lisp_Object Vbig5_coding_system;
 420
 421 /* ISO2022 section */
 422
     /* Element REG of the vector kept in the `coding_attr_iso_initial'
        slot of CODING's attribute vector, converted with XINT.
        NOTE(review): presumably the initial designation of graphic
        register REG -- confirm.  */
 423 #define CODING_ISO_INITIAL(coding, reg)                 \
 424   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 425                      coding_attr_iso_initial),          \
 426                reg)))
 427
 428
     /* Look up CHARSET_ID in CODING->safe_charsets.  Yield the stored
        element if CHARSET_ID does not exceed CODING->max_charset_id,
        and -1 otherwise.  CHARSET_ID is evaluated twice, so it must not
        have side effects; it is parenthesized in the comparison so that
        an argument containing a low-precedence operator still compares
        as a whole.  No lower bound is checked.  */
 429 #define CODING_ISO_REQUEST(coding, charset_id)  \
 430   (((charset_id) <= (coding)->max_charset_id    \
 431     ? (coding)->safe_charsets[charset_id]       \
 432     : -1))
 433
 434
     /* Accessors for the ISO2022 state kept in CODING->spec.iso_2022.
        Each of the first five expands to the member of the same name
        (indexed by REG or PLANE where there is such an argument).  */
 435 #define CODING_ISO_FLAGS(coding)        \
 436   ((coding)->spec.iso_2022.flags)
 437 #define CODING_ISO_DESIGNATION(coding, reg)     \
 438   ((coding)->spec.iso_2022.current_designation[reg])
 439 #define CODING_ISO_INVOCATION(coding, plane)    \
 440   ((coding)->spec.iso_2022.current_invocation[plane])
 441 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 442   ((coding)->spec.iso_2022.single_shifting)
 443 #define CODING_ISO_BOL(coding)  \
 444   ((coding)->spec.iso_2022.bol)
     /* The designation of whatever CODING_ISO_INVOCATION yields for
        PLANE.  */
 445 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 446   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 447
 448 /* Byte values of the ISO2022 control characters used by name.  */
 449                         /* code */      /* function */
 450 #define ISO_CODE_LF     0x0A            /* line-feed */
 451 #define ISO_CODE_CR     0x0D            /* carriage-return */
 452 #define ISO_CODE_SO     0x0E            /* shift-out */
 453 #define ISO_CODE_SI     0x0F            /* shift-in */
 454 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 455 #define ISO_CODE_ESC    0x1B            /* escape */
 456 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 457 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 458 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 459
 460 /* Every one-byte code of ISO2022 is classified into one of the
 461    following classes.  */
 462 enum iso_code_class_type
 463   {
 464     ISO_control_0,              /* Control codes in the range
 465                                    0x00..0x1F and 0x7F, except for the
 466                                    following 4 codes.  */
 467     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 468     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 469     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 470     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 471     ISO_control_1,              /* Control codes in the range
 472                                    0x80..0x9F, except for the
 473                                    following 3 codes.  */
 474     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 475     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 476     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 477     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 478     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 479     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 480     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 481   };
 482
 483 /** Each macro CODING_ISO_FLAG_XXX defines one flag bit of the
 484     `iso-flags' attribute of an iso2022 coding system.  */
 485
 486 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 487    instead of the correct short-form sequence (e.g. ESC $ A).  */
 488 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 489
 490 /* If set, reset graphic planes and registers at end-of-line to the
 491    initial state.  */
 492 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 493
 494 /* If set, reset graphic planes and registers before any control
 495    characters to the initial state.  */
 496 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 497
 498 /* If set, encode by 7-bit environment.  */
 499 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 500
 501 /* If set, use locking-shift function.  */
 502 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 503
 504 /* If set, use single-shift function.  Overrides
 505    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 506 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 507
 508 /* If set, use designation escape sequence.  */
 509 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 510
 511 /* If set, produce revision number sequence.  */
 512 #define CODING_ISO_FLAG_REVISION        0x0080
 513
 514 /* If set, produce ISO6429's direction specifying sequence.  */
 515 #define CODING_ISO_FLAG_DIRECTION       0x0100
 516
 517 /* If set, assume designation states are reset at beginning of line on
 518    output.  */
 519 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 520
 521 /* If set, designation sequence should be placed at beginning of line
 522    on output.  */
 523 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 524
 525 /* If set, do not encode unsafe characters on output.  */
 526 #define CODING_ISO_FLAG_SAFE            0x0800
 527
 528 /* If set, extra latin codes (128..159) are accepted as valid codes
 529    on input.  */
 530 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 531
     /* NOTE(review): the five flags below carry no description here;
        document them from their users.  Note also that the values jump
        from 0x10000 straight to 0x100000.  */
 532 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 533
 534 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 535
 536 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 537
 538 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 539
 540 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 541
 542 /* A character to be produced on output if encoding of the original
 543    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 544 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 545
 546
 547 /* UTF-16 section: accessors for the `bom', `endian' and `surrogate'
        members of CODING->spec.utf_16.  */
 548 #define CODING_UTF_16_BOM(coding)       \
 549   ((coding)->spec.utf_16.bom)
 550
 551 #define CODING_UTF_16_ENDIAN(coding)    \
 552   ((coding)->spec.utf_16.endian)
 553
 554 #define CODING_UTF_16_SURROGATE(coding) \
 555   ((coding)->spec.utf_16.surrogate)
 556
 557
 558 /* CCL section: each macro fetches one slot (`coding_attr_ccl_decoder',
        `coding_attr_ccl_encoder' or `coding_attr_ccl_valids') of CODING's
        attribute vector; CODING_CCL_VALIDS additionally applies SDATA to
        the slot.  */
 559 #define CODING_CCL_DECODER(coding)      \
 560   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 561 #define CODING_CCL_ENCODER(coding)      \
 562   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 563 #define CODING_CCL_VALIDS(coding)                                          \
 564   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 565
 566 /* Index of each coding category in `coding_categories'.  The last
        enumerator, coding_category_max, is the number of categories and
        gives the size of the per-category arrays in this file.  */
 567
 568 enum coding_category
 569   {
 570     coding_category_iso_7,
 571     coding_category_iso_7_tight,
 572     coding_category_iso_8_1,
 573     coding_category_iso_8_2,
 574     coding_category_iso_7_else,
 575     coding_category_iso_8_else,
 576     coding_category_utf_8,
 577     coding_category_utf_16_auto,
 578     coding_category_utf_16_be,
 579     coding_category_utf_16_le,
 580     coding_category_utf_16_be_nosig,
 581     coding_category_utf_16_le_nosig,
 582     coding_category_charset,
 583     coding_category_sjis,
 584     coding_category_big5,
 585     coding_category_ccl,
 586     coding_category_emacs_mule,
 587     /* All above are targets of code detection.  */
 588     coding_category_raw_text,
 589     coding_category_undecided,
 590     coding_category_max
 591   };
 592
 593 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
        1 shifted left by the corresponding `enum coding_category'
        value.  */
 594 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 595 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 596 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 597 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 598 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 599 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 600 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 601 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 602 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 603 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 604 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 605 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 606 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 607 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 608 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 609 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 610 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 611 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 612
 613 /* This value is returned if detect_coding_mask () finds nothing other
 614    than ASCII characters.  It does not include
        CATEGORY_MASK_UTF_16_AUTO or CATEGORY_MASK_RAW_TEXT.  */
 615 #define CATEGORY_MASK_ANY               \
 616   (CATEGORY_MASK_ISO_7                  \
 617    | CATEGORY_MASK_ISO_7_TIGHT          \
 618    | CATEGORY_MASK_ISO_8_1              \
 619    | CATEGORY_MASK_ISO_8_2              \
 620    | CATEGORY_MASK_ISO_7_ELSE           \
 621    | CATEGORY_MASK_ISO_8_ELSE           \
 622    | CATEGORY_MASK_UTF_8                \
 623    | CATEGORY_MASK_UTF_16_BE            \
 624    | CATEGORY_MASK_UTF_16_LE            \
 625    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 626    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 627    | CATEGORY_MASK_CHARSET              \
 628    | CATEGORY_MASK_SJIS                 \
 629    | CATEGORY_MASK_BIG5                 \
 630    | CATEGORY_MASK_CCL                  \
 631    | CATEGORY_MASK_EMACS_MULE)
 632
 633
     /* Groups of ISO2022 categories: the two 7-bit ones, the two 8-bit
        ones, the two `else' ones, the four that make up
        CATEGORY_MASK_ISO_ESCAPE, and all six together.  */
 634 #define CATEGORY_MASK_ISO_7BIT \
 635   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 636
 637 #define CATEGORY_MASK_ISO_8BIT \
 638   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 639
 640 #define CATEGORY_MASK_ISO_ELSE \
 641   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 642
 643 #define CATEGORY_MASK_ISO_ESCAPE        \
 644   (CATEGORY_MASK_ISO_7                  \
 645    | CATEGORY_MASK_ISO_7_TIGHT          \
 646    | CATEGORY_MASK_ISO_7_ELSE           \
 647    | CATEGORY_MASK_ISO_8_ELSE)
 648
 649 #define CATEGORY_MASK_ISO       \
 650   (  CATEGORY_MASK_ISO_7BIT     \
 651      | CATEGORY_MASK_ISO_8BIT   \
 652      | CATEGORY_MASK_ISO_ELSE)
 653
     /* The four UTF-16 categories with a fixed byte order;
        CATEGORY_MASK_UTF_16_AUTO is not included.  */
 654 #define CATEGORY_MASK_UTF_16            \
 655   (CATEGORY_MASK_UTF_16_BE              \
 656    | CATEGORY_MASK_UTF_16_LE            \
 657    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 658    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 659
 660
 661 /* List of symbols `coding-category-xxx' ordered by priority.  This
 662    variable is exposed to Emacs Lisp.  */
 663 static Lisp_Object Vcoding_category_list;
 664
 665 /* Table of coding categories (Lisp symbols).  This variable is for
 666    internal use only.  */
 667 static Lisp_Object Vcoding_category_table;
 668
 669 /* Table of coding categories ordered by priority; it has
        coding_category_max elements.  */
 670 static enum coding_category coding_priorities[coding_category_max];
 671
 672 /* Nth element is a coding context for the coding system bound to the
 673    Nth coding category.  */
 674 static struct coding_system coding_categories[coding_category_max];
 675
 676 /*** Commonly used macros and functions ***/
 677
     /* Smaller and larger of A and B, defined only if not defined
        already.  The selected argument is evaluated twice, so do not
        pass expressions with side effects.  */
 678 #ifndef min
 679 #define min(a, b) ((a) < (b) ? (a) : (b))
 680 #endif
 681 #ifndef max
 682 #define max(a, b) ((a) > (b) ? (a) : (b))
 683 #endif
 684
     /* Set ATTRS to the attribute vector selected by CODING->id, then
        set CHARSET_LIST to the charset list taken from ATTRS.  ATTRS
        and CHARSET_LIST must be lvalues.  */
 685 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 686   do {                                                  \
 687     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 688     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 689   } while (0)
 690
 691
 692 /* Safely get one byte from the source text pointed to by SRC, which
 693    ends at SRC_END, set C to that byte and increment CONSUMED_CHARS.
 694    If the source is used up, jump to `no_more_source', first recording
 695    CODING_RESULT_INSUFFICIENT_SRC in CODING when SRC_BASE < SRC.  If
 696    MULTIBYTEP is nonzero and the byte has its high bit set: a lead
 697    byte 0xC0 or 0xC1 is merged with the byte after it into one value;
 698    any other such byte is re-read with string_char, C is set to the
        negative of the resulting character code, and
        CODING_RESULT_INVALID_SRC is recorded in CODING.  The caller
        should declare and set these variables appropriately in advance:
             src, src_end, src_base, multibytep, consumed_chars, coding
        and must provide the label `no_more_source'.
        NOTE(review): the byte after a 0xC0/0xC1 lead byte is read
        without comparing SRC against SRC_END -- confirm that callers
        guarantee it is there.  */
 699
 700 #define ONE_MORE_BYTE(c)                                \
 701   do {                                                  \
 702     if (src == src_end)                                 \
 703       {                                                 \
 704         if (src_base < src)                             \
 705           record_conversion_result                      \
 706             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 707         goto no_more_source;                            \
 708       }                                                 \
 709     c = *src++;                                         \
 710     if (multibytep && (c & 0x80))                       \
 711       {                                                 \
 712         if ((c & 0xFE) == 0xC0)                         \
 713           c = ((c & 1) << 6) | *src++;                  \
 714         else                                            \
 715           {                                             \
 716             src--;                                      \
 717             c = - string_char (src, &src, NULL);        \
 718             record_conversion_result                    \
 719               (coding, CODING_RESULT_INVALID_SRC);      \
 720           }                                             \
 721       }                                                 \
 722     consumed_chars++;                                   \
 723   } while (0)
 724
 725
     /* Like ONE_MORE_BYTE, but without the test of SRC against SRC_END
        (and so without the jump to `no_more_source').  The caller must
        already know that the bytes to be read exist.  It uses the
        caller's src, multibytep, consumed_chars and coding.  */
 726 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 727   do {                                                  \
 728     c = *src++;                                         \
 729     if (multibytep && (c & 0x80))                       \
 730       {                                                 \
 731         if ((c & 0xFE) == 0xC0)                         \
 732           c = ((c & 1) << 6) | *src++;                  \
 733         else                                            \
 734           {                                             \
 735             src--;                                      \
 736             c = - string_char (src, &src, NULL);        \
 737             record_conversion_result                    \
 738               (coding, CODING_RESULT_INVALID_SRC);      \
 739           }                                             \
 740       }                                                 \
 741     consumed_chars++;                                   \
 742   } while (0)
 743
 744
 745 /* Store a byte C in the place pointed to by DST, advance DST to the
 746    next free position, and increment PRODUCED_CHARS.  The caller should
 747    ensure that C is 0..127, and declare and set the variables `dst'
 748    and `produced_chars' appropriately in advance.
 749 */
 750
 751
 752 #define EMIT_ONE_ASCII_BYTE(c)  \
 753   do {                          \
 754     produced_chars++;           \
 755     *dst++ = (c);               \
 756   } while (0)
 757
 758
 759 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes, C1 and then C2, and
        add 2 to PRODUCED_CHARS.  */
 760
 761 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 762   do {                                  \
 763     produced_chars += 2;                \
 764     *dst++ = (c1), *dst++ = (c2);       \
 765   } while (0)
 766
 767
 768 /* Store a byte C in the place pointed to by DST, advance DST to the
 769    next free position, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 770    nonzero, store it in multibyte form: a value of 0x80 or more is first
 771    converted with BYTE8_TO_CHAR, and the character is then written with
 772    CHAR_STRING_ADVANCE.  The caller should declare and set the
        variables `dst', `multibytep' and `produced_chars' appropriately
        in advance.  */
 773
 774 #define EMIT_ONE_BYTE(c)                \
 775   do {                                  \
 776     produced_chars++;                   \
 777     if (multibytep)                     \
 778       {                                 \
 779         int ch = (c);                   \
 780         if (ch >= 0x80)                 \
 781           ch = BYTE8_TO_CHAR (ch);      \
 782         CHAR_STRING_ADVANCE (ch, dst);  \
 783       }                                 \
 784     else                                \
 785       *dst++ = (c);                     \
 786   } while (0)
 787
 788
 789 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 790
 791 #define EMIT_TWO_BYTES(c1, c2)          \
 792   do {                                  \
 793     produced_chars += 2;                \
 794     if (multibytep)                     \
 795       {                                 \
 796         int ch;                         \
 797                                         \
 798         ch = (c1);                      \
 799         if (ch >= 0x80)                 \
 800           ch = BYTE8_TO_CHAR (ch);      \
 801         CHAR_STRING_ADVANCE (ch, dst);  \
 802         ch = (c2);                      \
 803         if (ch >= 0x80)                 \
 804           ch = BYTE8_TO_CHAR (ch);      \
 805         CHAR_STRING_ADVANCE (ch, dst);  \
 806       }                                 \
 807     else                                \
 808       {                                 \
 809         *dst++ = (c1);                  \
 810         *dst++ = (c2);                  \
 811       }                                 \
 812   } while (0)
 813
 814
 815 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 816   do {                                  \
 817     EMIT_ONE_BYTE (c1);                 \
 818     EMIT_TWO_BYTES (c2, c3);            \
 819   } while (0)
 820
 821
 822 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 823   do {                                          \
 824     EMIT_TWO_BYTES (c1, c2);                    \
 825     EMIT_TWO_BYTES (c3, c4);                    \
 826   } while (0)
 827
 828
/* Prototypes for static functions.  They are written with the P_
   macro; all of these functions are private to this file.  */

/* Records the result code of a conversion in a coding system.  */
static void record_conversion_result P_ ((struct coding_system *coding,
                                          enum coding_result_code result));

/* One detector / decoder / encoder triple per format.  Each detector
   takes the coding system and a coding_detection_info to fill in;
   each decoder and encoder takes only the coding system.  */
static int detect_coding_utf_8 P_ ((struct coding_system *,
                                    struct coding_detection_info *info));
static void decode_coding_utf_8 P_ ((struct coding_system *));
static int encode_coding_utf_8 P_ ((struct coding_system *));

static int detect_coding_utf_16 P_ ((struct coding_system *,
                                     struct coding_detection_info *info));
static void decode_coding_utf_16 P_ ((struct coding_system *));
static int encode_coding_utf_16 P_ ((struct coding_system *));

static int detect_coding_iso_2022 P_ ((struct coding_system *,
                                       struct coding_detection_info *info));
static void decode_coding_iso_2022 P_ ((struct coding_system *));
static int encode_coding_iso_2022 P_ ((struct coding_system *));

static int detect_coding_emacs_mule P_ ((struct coding_system *,
                                         struct coding_detection_info *info));
static void decode_coding_emacs_mule P_ ((struct coding_system *));
static int encode_coding_emacs_mule P_ ((struct coding_system *));

static int detect_coding_sjis P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_sjis P_ ((struct coding_system *));
static int encode_coding_sjis P_ ((struct coding_system *));

static int detect_coding_big5 P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_big5 P_ ((struct coding_system *));
static int encode_coding_big5 P_ ((struct coding_system *));

static int detect_coding_ccl P_ ((struct coding_system *,
                                  struct coding_detection_info *info));
static void decode_coding_ccl P_ ((struct coding_system *));
static int encode_coding_ccl P_ ((struct coding_system *));

/* Raw text has a decoder and an encoder but no detector.  */
static void decode_coding_raw_text P_ ((struct coding_system *));
static int encode_coding_raw_text P_ ((struct coding_system *));

/* Maintenance of the source and destination pointers of a coding
   system, and enlargement of the destination.  */
static void coding_set_source P_ ((struct coding_system *));
static void coding_set_destination P_ ((struct coding_system *));
static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
static void coding_alloc_by_making_gap P_ ((struct coding_system *,
                                            EMACS_INT));
static unsigned char *alloc_destination P_ ((struct coding_system *,
                                             EMACS_INT, unsigned char *));

/* The remaining helpers are defined later in this file.  */
static void setup_iso_safe_charsets P_ ((Lisp_Object));
static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
                                                     int *, int *,
                                                     unsigned char *));
static int detect_eol P_ ((const unsigned char *,
                           EMACS_INT, enum coding_category));
static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
static void decode_eol P_ ((struct coding_system *));
static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
                                        int, int *, int *));
static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
static INLINE void produce_composition P_ ((struct coding_system *, int *,
                                            EMACS_INT));
static INLINE void produce_charset P_ ((struct coding_system *, int *,
                                        EMACS_INT));
static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
static int decode_coding P_ ((struct coding_system *));
static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
                                                      struct coding_system *,
                                                      int *, EMACS_INT *));
static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
                                                  struct coding_system *,
                                                  int *, EMACS_INT *));
static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
static int encode_coding P_ ((struct coding_system *));
static Lisp_Object make_conversion_work_buffer P_ ((int));
static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
static INLINE int char_encodable_p P_ ((int, Lisp_Object));
static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 907
 908 static void
 909 record_conversion_result (struct coding_system *coding,
 910                           enum coding_result_code result)
 911 {
 912   coding->result = result;
 913   switch (result)
 914     {
 915     case CODING_RESULT_INSUFFICIENT_SRC:
 916       Vlast_code_conversion_error = Qinsufficient_source;
 917       break;
 918     case CODING_RESULT_INCONSISTENT_EOL:
 919       Vlast_code_conversion_error = Qinconsistent_eol;
 920       break;
 921     case CODING_RESULT_INVALID_SRC:
 922       Vlast_code_conversion_error = Qinvalid_source;
 923       break;
 924     case CODING_RESULT_INTERRUPT:
 925       Vlast_code_conversion_error = Qinterrupted;
 926       break;
 927     case CODING_RESULT_INSUFFICIENT_MEM:
 928       Vlast_code_conversion_error = Qinsufficient_memory;
 929       break;
 930     default:
 931       Vlast_code_conversion_error = intern ("Unknown error");
 932     }
 933 }
 934
 935 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 936   do {                                                                       \
 937     charset_map_loaded = 0;                                                  \
 938     c = DECODE_CHAR (charset, code);                                         \
 939     if (charset_map_loaded)                                                  \
 940       {                                                                      \
 941         const unsigned char *orig = coding->source;                          \
 942         EMACS_INT offset;                                                    \
 943                                                                              \
 944         coding_set_source (coding);                                          \
 945         offset = coding->source - orig;                                      \
 946         src += offset;                                                       \
 947         src_base += offset;                                                  \
 948         src_end += offset;                                                   \
 949       }                                                                      \
 950   } while (0)
 951
 952
 953 #define ASSURE_DESTINATION(bytes)                               \
 954   do {                                                          \
 955     if (dst + (bytes) >= dst_end)                               \
 956       {                                                         \
 957         int more_bytes = charbuf_end - charbuf + (bytes);       \
 958                                                                 \
 959         dst = alloc_destination (coding, more_bytes, dst);      \
 960         dst_end = coding->destination + coding->dst_bytes;      \
 961       }                                                         \
 962   } while (0)
 963
 964
 965
 966 static void
 967 coding_set_source (coding)
 968      struct coding_system *coding;
 969 {
 970   if (BUFFERP (coding->src_object))
 971     {
 972       struct buffer *buf = XBUFFER (coding->src_object);
 973
 974       if (coding->src_pos < 0)
 975         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 976       else
 977         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 978     }
 979   else if (STRINGP (coding->src_object))
 980     {
 981       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 982     }
 983   else
 984     /* Otherwise, the source is C string and is never relocated
 985        automatically.  Thus we don't have to update anything.  */
 986     ;
 987 }
 988
 989 static void
 990 coding_set_destination (coding)
 991      struct coding_system *coding;
 992 {
 993   if (BUFFERP (coding->dst_object))
 994     {
 995       if (coding->src_pos < 0)
 996         {
 997           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
 998           coding->dst_bytes = (GAP_END_ADDR
 999                                - (coding->src_bytes - coding->consumed)
1000                                - coding->destination);
1001         }
1002       else
1003         {
1004           /* We are sure that coding->dst_pos_byte is before the gap
1005              of the buffer. */
1006           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1007                                  + coding->dst_pos_byte - 1);
1008           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1009                                - coding->destination);
1010         }
1011     }
1012   else
1013     /* Otherwise, the destination is C string and is never relocated
1014        automatically.  Thus we don't have to update anything.  */
1015     ;
1016 }
1017
1018
1019 static void
1020 coding_alloc_by_realloc (coding, bytes)
1021      struct coding_system *coding;
1022      EMACS_INT bytes;
1023 {
1024   coding->destination = (unsigned char *) xrealloc (coding->destination,
1025                                                     coding->dst_bytes + bytes);
1026   coding->dst_bytes += bytes;
1027 }
1028
1029 static void
1030 coding_alloc_by_making_gap (coding, bytes)
1031      struct coding_system *coding;
1032      EMACS_INT bytes;
1033 {
1034   if (BUFFERP (coding->dst_object)
1035       && EQ (coding->src_object, coding->dst_object))
1036     {
1037       EMACS_INT add = coding->src_bytes - coding->consumed;
1038
1039       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1040       make_gap (bytes);
1041       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1042     }
1043   else
1044     {
1045       Lisp_Object this_buffer;
1046
1047       this_buffer = Fcurrent_buffer ();
1048       set_buffer_internal (XBUFFER (coding->dst_object));
1049       make_gap (bytes);
1050       set_buffer_internal (XBUFFER (this_buffer));
1051     }
1052 }
1053
1054
1055 static unsigned char *
1056 alloc_destination (coding, nbytes, dst)
1057      struct coding_system *coding;
1058      EMACS_INT nbytes;
1059      unsigned char *dst;
1060 {
1061   EMACS_INT offset = dst - coding->destination;
1062
1063   if (BUFFERP (coding->dst_object))
1064     coding_alloc_by_making_gap (coding, nbytes);
1065   else
1066     coding_alloc_by_realloc (coding, nbytes);
1067   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1068   coding_set_destination (coding);
1069   dst = coding->destination + offset;
1070   return dst;
1071 }
1072
/** Macros for annotations.  */

/* Maximum length of annotation data (sum of annotations for
   composition and charset).  The terms of the sum are: 4 elements for
   a composition annotation as written by ADD_COMPOSITION_DATA, up to
   MAX_COMPOSITION_COMPONENTS * 2 - 1 elements of composition
   components (presumably N characters and N - 1 rules -- confirm),
   and 4 elements for a charset annotation as written by
   ADD_CHARSET_DATA.  */
#define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1078
/* Annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the text annotated.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... METHOD [ COMPOSITION-COMPONENTS ... ]
   METHOD is one of enum composition_method.
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.  */

/* Store the three leading elements of an annotation (-LEN, MASK and
   NCHARS) at BUF, advance BUF past them, and set coding->annotated.
   BUF must be a modifiable `int *' lvalue; it is evaluated three
   times.  The caller must have the variable `coding' in scope.

   The expansion is a single statement without a trailing semicolon,
   so the macro can be used as the body of an `if' that has an
   `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1105
/* Store a composition annotation at BUF and advance BUF past it: the
   common header written by ADD_ANNOTATION_DATA with a length of 4 and
   the mask CODING_ANNOTATE_COMPOSITION_MASK, followed by METHOD.
   NCHARS is the number of characters annotated.  BUF must be a
   modifiable `int *' lvalue; BUF and METHOD are parenthesized in the
   expansion so that any such lvalue (e.g. `*pp') and any expression
   can be passed.  */

#define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (method);                                                    \
  } while (0)
1111
1112
/* Store a charset annotation at BUF and advance BUF past it: the
   common header written by ADD_ANNOTATION_DATA with a length of 4 and
   the mask CODING_ANNOTATE_CHARSET_MASK, followed by the charset ID.
   NCHARS is the number of characters annotated.  BUF must be a
   modifiable `int *' lvalue; BUF and ID are parenthesized in the
   expansion so that any such lvalue (e.g. `*pp') and any expression
   can be passed.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1118
1119 \f
1120 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1121
1122
1123
1124 \f
1125 /*** 3. UTF-8 ***/
1126
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in UTF-8.  If it is, return 1, else
   return 0.  */

/* Classification of a single octet C by its high bits:
     UTF_8_1_OCTET_P          C is below 0x80          (0xxxxxxx)
     UTF_8_EXTRA_OCTET_P      top two bits are 10      (10xxxxxx)
     UTF_8_2_OCTET_LEADING_P  top three bits are 110   (110xxxxx)
     UTF_8_3_OCTET_LEADING_P  top four bits are 1110   (1110xxxx)
     UTF_8_4_OCTET_LEADING_P  top five bits are 11110  (11110xxx)
     UTF_8_5_OCTET_LEADING_P  top six bits are 111110  (111110xx)
   The decoder below treats the N_OCTET_LEADING classes as the first
   octet of an N-octet sequence and EXTRA as each following octet.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1137
/* Check whether the source of CODING can be UTF-8 and report the
   verdict in DETECT_INFO.

   CATEGORY_MASK_UTF_8 is always added to DETECT_INFO->checked.  If a
   byte sequence that is not well formed as UTF-8 is met, or if the
   source ends in the middle of a sequence while CODING->mode has
   CODING_MODE_LAST_BLOCK set, the category is added to
   DETECT_INFO->rejected and 0 is returned.  Otherwise 1 is returned,
   and the category is added to DETECT_INFO->found if at least one
   complete multi-octet sequence was seen.

   The label `no_more_source' is only reached by a jump from
   ONE_MORE_BYTE (defined above), presumably when SRC has reached
   SRC_END.  The locals `src', `src_end', `multibytep' and
   `consumed_chars' are there for that macro.  */

static int
detect_coding_utf_8 (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source, *src_base;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  /* Becomes CATEGORY_MASK_UTF_8 once a complete multi-octet sequence
     has been seen.  */
  int found = 0;

  detect_info->checked |= CATEGORY_MASK_UTF_8;
  /* A coding system of this category is always ASCII compatible.  */
  src += coding->head_ascii;

  while (1)
    {
      int c, c1, c2, c3, c4;

      /* SRC_BASE marks the start of the sequence being examined.  */
      src_base = src;
      ONE_MORE_BYTE (c);
      /* Negative values and single-octet characters say nothing
         about UTF-8; go on with the next byte.  */
      if (c < 0 || UTF_8_1_OCTET_P (c))
        continue;
      /* Every following octet must be of the form 10xxxxxx.  The
         leading octet is classified only after the corresponding
         number of following octets has been accepted.  */
      ONE_MORE_BYTE (c1);
      if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
        break;
      if (UTF_8_2_OCTET_LEADING_P (c))
        {
          found = CATEGORY_MASK_UTF_8;
          continue;
        }
      ONE_MORE_BYTE (c2);
      if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
        break;
      if (UTF_8_3_OCTET_LEADING_P (c))
        {
          found = CATEGORY_MASK_UTF_8;
          continue;
        }
      ONE_MORE_BYTE (c3);
      if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
        break;
      if (UTF_8_4_OCTET_LEADING_P (c))
        {
          found = CATEGORY_MASK_UTF_8;
          continue;
        }
      ONE_MORE_BYTE (c4);
      if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
        break;
      if (UTF_8_5_OCTET_LEADING_P (c))
        {
          found = CATEGORY_MASK_UTF_8;
          continue;
        }
      /* C is not a leading octet of any accepted length.  */
      break;
    }
  /* The loop is left by `break' only for an invalid sequence.  */
  detect_info->rejected |= CATEGORY_MASK_UTF_8;
  return 0;

 no_more_source:
  /* SRC_BASE < SRC means that a sequence was started but not
     finished.  That is an error only if no more data will follow.  */
  if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_8;
      return 0;
    }
  detect_info->found |= found;
  return 1;
}
1207
1208
/* Decode the UTF-8 bytes of CODING's source, starting at offset
   CODING->consumed, into character codes appended to CODING->charbuf
   after the CODING->charbuf_used elements already there.

   Decoding stops when the charbuf is full or when ONE_MORE_BYTE
   (defined above) jumps to `no_more_source', presumably because the
   source is exhausted.  On return, CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used describe the state
   at the start of the last sequence that was not completely decoded,
   so an incomplete trailing sequence is left unconsumed.

   A sequence that is not well formed, is overlong, or encodes a
   surrogate or a code above MAX_CHAR is not decoded as a unit: only
   its first byte is consumed, it is stored as is if ASCII and as
   BYTE8_TO_CHAR of it otherwise, and CODING->errors is incremented.
   Decoding then resumes at the following byte.  */

static void
decode_coding_utf_8 (coding)
     struct coding_system *coding;
{
  const unsigned char *src = coding->source + coding->consumed;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  const unsigned char *src_base;
  int *charbuf = coding->charbuf + coding->charbuf_used;
  int *charbuf_end = coding->charbuf + coding->charbuf_size;
  int consumed_chars = 0, consumed_chars_base;
  int multibytep = coding->src_multibyte;
  Lisp_Object attr, charset_list;

  CODING_GET_INFO (coding, attr, charset_list);

  while (1)
    {
      int c, c1, c2, c3, c4, c5;

      /* Remember where this sequence starts, both for the final
         bookkeeping and for backing up at `invalid_code'.  */
      src_base = src;
      consumed_chars_base = consumed_chars;

      if (charbuf >= charbuf_end)
        break;

      ONE_MORE_BYTE (c1);
      if (c1 < 0)
        {
          /* A negative value is presumably the negated code of a
             multibyte character met in the source; store the
             character itself.  */
          c = - c1;
        }
      else if (UTF_8_1_OCTET_P(c1))
        {
          c = c1;
        }
      else
        {
          ONE_MORE_BYTE (c2);
          if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
            goto invalid_code;
          if (UTF_8_2_OCTET_LEADING_P (c1))
            {
              c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
              /* Reject overlong sequences here and below.  Encoders
                 producing them are incorrect, they can be misleading,
                 and they mess up read/write invariance.  */
              if (c < 128)
                goto invalid_code;
            }
          else
            {
              ONE_MORE_BYTE (c3);
              if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
                goto invalid_code;
              if (UTF_8_3_OCTET_LEADING_P (c1))
                {
                  c = (((c1 & 0xF) << 12)
                       | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
                  if (c < 0x800
                      || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
                    goto invalid_code;
                }
              else
                {
                  ONE_MORE_BYTE (c4);
                  if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
                    goto invalid_code;
                  if (UTF_8_4_OCTET_LEADING_P (c1))
                    {
                    c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
                         | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                    /* Overlong: the value fits in three octets.  */
                    if (c < 0x10000)
                      goto invalid_code;
                    }
                  else
                    {
                      ONE_MORE_BYTE (c5);
                      if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
                        goto invalid_code;
                      if (UTF_8_5_OCTET_LEADING_P (c1))
                        {
                          c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
                               | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
                               | (c5 & 0x3F));
                          /* Too large for a character, or overlong
                             (the value fits in four octets).  */
                          if ((c > MAX_CHAR) || (c < 0x200000))
                            goto invalid_code;
                        }
                      else
                        goto invalid_code;
                    }
                }
            }
        }

      *charbuf++ = c;
      continue;

    invalid_code:
      /* Back up to the start of the sequence and consume just its
         first byte.  */
      src = src_base;
      consumed_chars = consumed_chars_base;
      ONE_MORE_BYTE (c);
      *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
      coding->errors++;
    }

 no_more_source:
  /* Report progress up to the start of the last sequence that was
     not completely decoded.  */
  coding->consumed_char += consumed_chars_base;
  coding->consumed = src_base - coding->source;
  coding->charbuf_used = charbuf - coding->charbuf;
}
1318
1319
1320 static int
1321 encode_coding_utf_8 (coding)
1322      struct coding_system *coding;
1323 {
1324   int multibytep = coding->dst_multibyte;
1325   int *charbuf = coding->charbuf;
1326   int *charbuf_end = charbuf + coding->charbuf_used;
1327   unsigned char *dst = coding->destination + coding->produced;
1328   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1329   int produced_chars = 0;
1330   int c;
1331
1332   if (multibytep)
1333     {
1334       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1335
1336       while (charbuf < charbuf_end)
1337         {
1338           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1339
1340           ASSURE_DESTINATION (safe_room);
1341           c = *charbuf++;
1342           if (CHAR_BYTE8_P (c))
1343             {
1344               c = CHAR_TO_BYTE8 (c);
1345               EMIT_ONE_BYTE (c);
1346             }
1347           else
1348             {
1349               CHAR_STRING_ADVANCE (c, pend);
1350               for (p = str; p < pend; p++)
1351                 EMIT_ONE_BYTE (*p);
1352             }
1353         }
1354     }
1355   else
1356     {
1357       int safe_room = MAX_MULTIBYTE_LENGTH;
1358
1359       while (charbuf < charbuf_end)
1360         {
1361           ASSURE_DESTINATION (safe_room);
1362           c = *charbuf++;
1363           if (CHAR_BYTE8_P (c))
1364             *dst++ = CHAR_TO_BYTE8 (c);
1365           else
1366             dst += CHAR_STRING (c, dst);
1367           produced_chars++;
1368         }
1369     }
1370   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1371   coding->produced_char += produced_chars;
1372   coding->produced = dst - coding->destination;
1373   return 0;
1374 }
1375
1376
/* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
   Check if a text is encoded in one of UTF-16 based coding systems.
   If it is, return 1, else return 0.  */

/* Nonzero if the 16-bit code unit VAL is a high surrogate, i.e. is
   in the range 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the 16-bit code unit VAL is a low surrogate, i.e. is in
   the range 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)

/* Nonzero if VAL is 0xFFFE, 0xFFFF or a low surrogate.  VAL is
   evaluated more than once.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF)         \
   || UTF_16_LOW_SURROGATE_P (val))
1391
1392
/* Check whether the source of CODING can be UTF-16 and report the
   verdict in DETECT_INFO.  Only the first two bytes are examined.

   CATEGORY_MASK_UTF_16 is always added to DETECT_INFO->checked.  If
   CODING->mode has CODING_MODE_LAST_BLOCK set and CODING->src_chars
   is odd, all of CATEGORY_MASK_UTF_16 is rejected and 0 is returned.
   In every other case 1 is returned, after this update:
     - bytes FF FE: the LE and AUTO categories are found; BE and both
       NOSIG categories are rejected;
     - bytes FE FF: the BE and AUTO categories are found; LE and both
       NOSIG categories are rejected;
     - two other non-negative values: BE and LE are rejected.
   Nothing is updated if ONE_MORE_BYTE (defined above) jumps to
   `no_more_source', presumably because fewer than two bytes are
   available.  */

static int
detect_coding_utf_16 (coding, detect_info)
     struct coding_system *coding;
     struct coding_detection_info *detect_info;
{
  const unsigned char *src = coding->source, *src_base = src;
  const unsigned char *src_end = coding->source + coding->src_bytes;
  int multibytep = coding->src_multibyte;
  int consumed_chars = 0;
  int c1, c2;

  detect_info->checked |= CATEGORY_MASK_UTF_16;
  if (coding->mode & CODING_MODE_LAST_BLOCK
      && (coding->src_chars & 1))
    {
      detect_info->rejected |= CATEGORY_MASK_UTF_16;
      return 0;
    }

  ONE_MORE_BYTE (c1);
  ONE_MORE_BYTE (c2);
  if ((c1 == 0xFF) && (c2 == 0xFE))
    {
      /* Little endian byte order mark.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_LE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if ((c1 == 0xFE) && (c2 == 0xFF))
    {
      /* Big endian byte order mark.  */
      detect_info->found |= (CATEGORY_MASK_UTF_16_BE
                             | CATEGORY_MASK_UTF_16_AUTO);
      detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
                                | CATEGORY_MASK_UTF_16_BE_NOSIG
                                | CATEGORY_MASK_UTF_16_LE_NOSIG);
    }
  else if (c1 >= 0 && c2 >= 0)
    {
      /* No byte order mark: the BE and LE categories are rejected,
         the NOSIG ones stay possible.  */
      detect_info->rejected
        |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
    }
 no_more_source:
  return 1;
}
1438
1439 static void
1440 decode_coding_utf_16 (coding)
1441      struct coding_system *coding;
1442 {
1443   const unsigned char *src = coding->source + coding->consumed;
1444   const unsigned char *src_end = coding->source + coding->src_bytes;
1445   const unsigned char *src_base;
1446   int *charbuf = coding->charbuf + coding->charbuf_used;
1447   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1448   int consumed_chars = 0, consumed_chars_base;
1449   int multibytep = coding->src_multibyte;
1450   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1451   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1452   int surrogate = CODING_UTF_16_SURROGATE (coding);
1453   Lisp_Object attr, charset_list;
1454
1455   CODING_GET_INFO (coding, attr, charset_list);
1456
1457   if (bom == utf_16_with_bom)
1458     {
1459       int c, c1, c2;
1460
1461       src_base = src;
1462       ONE_MORE_BYTE (c1);
1463       ONE_MORE_BYTE (c2);
1464       c = (c1 << 8) | c2;
1465
1466       if (endian == utf_16_big_endian
1467           ? c != 0xFEFF : c != 0xFFFE)
1468         {
1469           /* The first two bytes are not BOM.  Treat them as bytes
1470              for a normal character.  */
1471           src = src_base;
1472           coding->errors++;
1473         }
1474       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1475     }
1476   else if (bom == utf_16_detect_bom)
1477     {
1478       /* We have already tried to detect BOM and failed in
1479          detect_coding.  */
1480       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1481     }
1482
1483   while (1)
1484     {
1485       int c, c1, c2;
1486
1487       src_base = src;
1488       consumed_chars_base = consumed_chars;
1489
1490       if (charbuf + 2 >= charbuf_end)
1491         break;
1492
1493       ONE_MORE_BYTE (c1);
1494       if (c1 < 0)
1495         {
1496           *charbuf++ = -c1;
1497           continue;
1498         }
1499       ONE_MORE_BYTE (c2);
1500       if (c2 < 0)
1501         {
1502           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1503           *charbuf++ = -c2;
1504           continue;
1505         }
1506       c = (endian == utf_16_big_endian
1507            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1508       if (surrogate)
1509         {
1510           if (! UTF_16_LOW_SURROGATE_P (c))
1511             {
1512               if (endian == utf_16_big_endian)
1513                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1514               else
1515                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1516               *charbuf++ = c1;
1517               *charbuf++ = c2;
1518               coding->errors++;
1519               if (UTF_16_HIGH_SURROGATE_P (c))
1520                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1521               else
1522                 *charbuf++ = c;
1523             }
1524           else
1525             {
1526               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1527               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1528               *charbuf++ = 0x10000 + c;
1529             }
1530         }
1531       else
1532         {
1533           if (UTF_16_HIGH_SURROGATE_P (c))
1534             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1535           else
1536             *charbuf++ = c;
1537         }
1538     }
1539
1540  no_more_source:
1541   coding->consumed_char += consumed_chars_base;
1542   coding->consumed = src_base - coding->source;
1543   coding->charbuf_used = charbuf - coding->charbuf;
1544 }
1545
1546 static int
1547 encode_coding_utf_16 (coding)
1548      struct coding_system *coding;
1549 {
1550   int multibytep = coding->dst_multibyte;
1551   int *charbuf = coding->charbuf;
1552   int *charbuf_end = charbuf + coding->charbuf_used;
1553   unsigned char *dst = coding->destination + coding->produced;
1554   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1555   int safe_room = 8;
1556   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1557   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1558   int produced_chars = 0;
1559   Lisp_Object attrs, charset_list;
1560   int c;
1561
1562   CODING_GET_INFO (coding, attrs, charset_list);
1563
1564   if (bom != utf_16_without_bom)
1565     {
1566       ASSURE_DESTINATION (safe_room);
1567       if (big_endian)
1568         EMIT_TWO_BYTES (0xFE, 0xFF);
1569       else
1570         EMIT_TWO_BYTES (0xFF, 0xFE);
1571       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1572     }
1573
1574   while (charbuf < charbuf_end)
1575     {
1576       ASSURE_DESTINATION (safe_room);
1577       c = *charbuf++;
1578       if (c >= MAX_UNICODE_CHAR)
1579         c = coding->default_char;
1580
1581       if (c < 0x10000)
1582         {
1583           if (big_endian)
1584             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1585           else
1586             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1587         }
1588       else
1589         {
1590           int c1, c2;
1591
1592           c -= 0x10000;
1593           c1 = (c >> 10) + 0xD800;
1594           c2 = (c & 0x3FF) + 0xDC00;
1595           if (big_endian)
1596             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1597           else
1598             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1599         }
1600     }
1601   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1602   coding->produced = dst - coding->destination;
1603   coding->produced_char += produced_chars;
1604   return 0;
1605 }
1606
1607 \f
1608 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1609
1610 /* Emacs' internal format for representation of multiple character
1611    sets is a kind of multi-byte encoding, i.e. characters are
1612    represented by variable-length sequences of one-byte codes.
1613
1614    ASCII characters and control characters (e.g. `tab', `newline') are
1615    represented by one-byte sequences which are their ASCII codes, in
1616    the range 0x00 through 0x7F.
1617
1618    8-bit characters of the range 0x80..0x9F are represented by
1619    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1620    code + 0x20).
1621
1622    8-bit characters of the range 0xA0..0xFF are represented by
1623    one-byte sequences which are their 8-bit code.
1624
1625    The other characters are represented by a sequence of `base
1626    leading-code', optional `extended leading-code', and one or two
1627    `position-code's.  The length of the sequence is determined by the
1628    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1629    whereas extended leading-code and position-code take the range 0xA0
1630    through 0xFF.  See `charset.h' for more details about leading-code
1631    and position-code.
1632
1633    --- CODE RANGE of Emacs' internal format ---
1634    character set        range
1635    -------------        -----
1636    ascii                0x00..0x7F
1637    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1638    eight-bit-graphic    0xA0..0xFF
1639    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1640    ---------------------------------------------
1641
1642    As this is the internal character representation, the format is
1643    usually not used externally (i.e. in a file or in data sent to a
1644    process).  However, text can also exist externally in this format
1645    (e.g. when it is encoded by the coding system `emacs-mule').
1646
1647    In that case, a sequence of one-byte codes has a slightly different
1648    form.
1649
1650    At first, all characters in eight-bit-control are represented by
1651    one-byte sequences which are their 8-bit code.
1652
1653    Next, character composition data are represented by the byte
1654    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1655    where,
1656         METHOD is 0xF2 plus one of the composition methods (enum
1657         composition_method),
1658
1659         BYTES is 0xA0 plus the byte length of this composition data,
1660
1661         CHARS is 0xA0 plus the number of characters composed by this
1662         data,
1663
1664         COMPONENTs are characters in multibyte form or composition
1665         rules encoded by two bytes of ASCII codes.
1666
1667    In addition, for backward compatibility, the following formats are
1668    also recognized as composition data on decoding.
1669
1670    0x80 MSEQ ...
1671    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1672
1673    Here,
1674         MSEQ is a multibyte form, but in this special format:
1675           ASCII: 0xA0 ASCII_CODE+0x80,
1676           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1677         RULE is a one byte code of the range 0xA0..0xF0 that
1678         represents a composition rule.
1679   */
1680
1681 char emacs_mule_bytes[256];  /* Indexed by a leading code;
                                     emacs_mule_char treats the entry as the
                                     total byte length (1..4) of the sequence
                                     that code starts and aborts on any
                                     other value.  */
1682 /* Decode the single emacs-mule character that starts at SRC.
        On success, return the character code, store the number of bytes
        read in *NBYTES and the value of consumed_chars in *NCHARS, and, if
        ID is not NULL, store the id of the character's charset in *ID.
        Return -1 if the bytes do not form a valid character.  Return -2
        through the `no_more_source' label (presumably reached from
        ONE_MORE_BYTE when the source is exhausted; that macro is defined
        elsewhere).  */
1683 int
1684 emacs_mule_char (coding, src, nbytes, nchars, id)
1685      struct coding_system *coding;
1686      const unsigned char *src;
1687      int *nbytes, *nchars, *id;
1688 {
1689   const unsigned char *src_end = coding->source + coding->src_bytes;
1690   const unsigned char *src_base = src;
1691   int multibytep = coding->src_multibyte;
1692   struct charset *charset;
1693   unsigned code;
1694   int c;
1695   int consumed_chars = 0;
1696
1697   ONE_MORE_BYTE (c);
1698   if (c < 0)
1699     {
           /* A negative value is returned negated and attributed to
              emacs_mule_charset[0].  Presumably ONE_MORE_BYTE yields a
              negative value for a raw byte found in multibyte source --
              TODO confirm against the macro.  */
1700       c = -c;
1701       charset = emacs_mule_charset[0];
1702     }
1703   else
1704     {
1705       if (c >= 0xA0)
1706         {
1707           /* Old style component character of a composition.  */
1708           if (c == 0xA0)
1709             {
                   /* ASCII component: 0xA0 followed by ASCII_CODE + 0x80.
                      NOTE(review): the result is used below as an index
                      into emacs_mule_bytes without a range check --
                      confirm that the byte read here cannot be below 0x80
                      (or negative).  */
1710               ONE_MORE_BYTE (c);
1711               c -= 0x80;
1712             }
1713           else
                 /* Other component: LEADING_CODE + 0x20.  */
1714             c -= 0x20;
1715         }
1716
1717       switch (emacs_mule_bytes[c])
1718         {
1719         case 2:
               /* Leading code followed by one position code.  */
1720           if (! (charset = emacs_mule_charset[c]))
1721             goto invalid_code;
1722           ONE_MORE_BYTE (c);
1723           if (c < 0xA0)
1724             goto invalid_code;
1725           code = c & 0x7F;
1726           break;
1727
1728         case 3:
1729           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1730               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1731             {
                   /* Private leading code, then the byte selecting the
                      charset, then one position code.  */
1732               ONE_MORE_BYTE (c);
1733               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1734                 goto invalid_code;
1735               ONE_MORE_BYTE (c);
1736               if (c < 0xA0)
1737                 goto invalid_code;
1738               code = c & 0x7F;
1739             }
1740           else
1741             {
                   /* Leading code followed by two position codes.  */
1742               if (! (charset = emacs_mule_charset[c]))
1743                 goto invalid_code;
1744               ONE_MORE_BYTE (c);
1745               if (c < 0xA0)
1746                 goto invalid_code;
1747               code = (c & 0x7F) << 8;
1748               ONE_MORE_BYTE (c);
1749               if (c < 0xA0)
1750                 goto invalid_code;
1751               code |= c & 0x7F;
1752             }
1753           break;
1754
1755         case 4:
               /* Leading code, the byte selecting the charset, then two
                  position codes.  NOTE(review): unlike the three-byte
                  private case above, the charset byte is only checked for
                  being non-negative, not for being >= 0xA0.  */
1756           ONE_MORE_BYTE (c);
1757           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1758             goto invalid_code;
1759           ONE_MORE_BYTE (c);
1760           if (c < 0xA0)
1761             goto invalid_code;
1762           code = (c & 0x7F) << 8;
1763           ONE_MORE_BYTE (c);
1764           if (c < 0xA0)
1765             goto invalid_code;
1766           code |= c & 0x7F;
1767           break;
1768
1769         case 1:
               /* A single byte: either ASCII or an eight-bit byte.  */
1770           code = c;
1771           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1772                                      ? charset_ascii : charset_eight_bit);
1773           break;
1774
1775         default:
1776           abort ();
1777         }
1778       c = DECODE_CHAR (charset, code);
1779       if (c < 0)
1780         goto invalid_code;
1781     }
1782   *nbytes = src - src_base;
1783   *nchars = consumed_chars;
1784   if (id)
1785     *id = charset->id;
1786   return c;
1787
1788  no_more_source:
1789   return -2;
1790
1791  invalid_code:
1792   return -1;
1793 }
1794
1795
1796 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1797    Check if a text is encoded in `emacs-mule'.  If the source is
1798    consistent with it, add what was found to DETECT_INFO->found and
        return 1; otherwise mark the category as rejected in
        DETECT_INFO->rejected and return 0.  */
1799
1800 static int
1801 detect_coding_emacs_mule (coding, detect_info)
1802      struct coding_system *coding;
1803      struct coding_detection_info *detect_info;
1804 {
1805   const unsigned char *src = coding->source, *src_base;
1806   const unsigned char *src_end = coding->source + coding->src_bytes;
1807   int multibytep = coding->src_multibyte;
1808   int consumed_chars = 0;
1809   int c;
       /* Becomes CATEGORY_MASK_EMACS_MULE once a composition or a complete
          multibyte sequence has been seen.  */
1810   int found = 0;
1811
1812   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1813   /* A coding system of this category is always ASCII compatible.  */
1814   src += coding->head_ascii;
1815
1816   while (1)
1817     {
           /* Remember where this character starts; used at
              `no_more_source' to tell whether the source ended in the
              middle of a sequence.  */
1818       src_base = src;
1819       ONE_MORE_BYTE (c);
1820       if (c < 0)
1821         continue;
1822       if (c == 0x80)
1823         {
1824           /* Perhaps the start of a composite character.  We simply skip
1825              it because analyzing it is too heavy for detecting.  But,
1826              at least, we check that the composite character
1827              consists of more than 4 bytes.  */
               /* This declaration shadows the SRC_BASE of function scope.  */
1828           const unsigned char *src_base;
1829
1830         repeat:
1831           src_base = src;
1832           do
1833             {
1834               ONE_MORE_BYTE (c);
1835             }
1836           while (c >= 0xA0);
1837
1838           if (src - src_base <= 4)
1839             break;
1840           found = CATEGORY_MASK_EMACS_MULE;
1841           if (c == 0x80)
1842             goto repeat;
1843         }
1844
           /* NOTE(review): when the composition scan above stops on a byte
              in 0x81..0x9F, the `else' branch below reads *SRC_BASE of the
              function-scope variable, which still points at the 0x80 byte
              -- confirm that this is intended.  */
1845       if (c < 0x80)
1846         {
               /* ESC, SI and SO leave the loop, which rejects the category.  */
1847           if (c < 0x20
1848               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1849             break;
1850         }
1851       else
1852         {
               /* Number of bytes that must follow the leading byte.  */
1853           int more_bytes = emacs_mule_bytes[*src_base] - 1;
1854
1855           while (more_bytes > 0)
1856             {
1857               ONE_MORE_BYTE (c);
1858               if (c < 0xA0)
1859                 {
1860                   src--;        /* Unread the last byte.  */
1861                   break;
1862                 }
1863               more_bytes--;
1864             }
               /* The sequence was cut short by a byte below 0xA0 (or the
                  table entry was not positive): reject.  */
1865           if (more_bytes != 0)
1866             break;
1867           found = CATEGORY_MASK_EMACS_MULE;
1868         }
1869     }
1870   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1871   return 0;
1872
1873  no_more_source:
       /* Presumably reached from ONE_MORE_BYTE when the source is exhausted
          (the macro is defined elsewhere).  An unfinished sequence at the
          end of the last block is a rejection; otherwise the text is
          accepted.  */
1874   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1875     {
1876       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1877       return 0;
1878     }
1879   detect_info->found |= found;
1880   return 1;
1881 }
1882
1883
1884 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1885
1886 /* Decode a character represented as a component of composition
1887    sequence of Emacs 20/21 style at SRC.  Store the character in *BUF,
1888    increment BUF, and advance SRC and consumed_chars past it.  If SRC
1889    has reached SRC_END, or emacs_mule_char reports -2, execute `break',
1890    which leaves the innermost loop (or `do ... while (0)') enclosing
1891    the macro call.  For any other failure, jump to `invalid_code'.
        The `if (1) ... else' wrapper makes the call site supply the
        terminating semicolon.  */
1892
1893 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1894   if (1)                                                        \
1895     {                                                           \
1896       int c;                                                    \
1897       int nbytes, nchars;                                       \
1898                                                                 \
1899       if (src == src_end)                                       \
1900         break;                                                  \
1901       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1902       if (c < 0)                                                \
1903         {                                                       \
1904           if (c == -2)                                          \
1905             break;                                              \
1906           goto invalid_code;                                    \
1907         }                                                       \
1908       *buf++ = c;                                               \
1909       src += nbytes;                                            \
1910       consumed_chars += nchars;                                 \
1911     }                                                           \
1912   else
1913
1914
1915 /* Decode a composition rule represented as a component of composition
1916    sequence of Emacs 20 style at SRC.  The rule is one byte in the
1917    range 0xA0..0xF0; store the decoded rule in *BUF and increment BUF.
1918    If no byte is left or the byte is out of range, jump to
        `invalid_code'.  */
1919
1920 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1921   do {                                                  \
1922     int c, gref, nref;                                  \
1923                                                         \
1924     if (src >= src_end)                                 \
1925       goto invalid_code;                                \
1926     ONE_MORE_BYTE_NO_CHECK (c);                         \
1927     c -= 0xA0;                                          \
1928     if (c < 0 || c >= 81)                               \
1929       goto invalid_code;                                \
1930                                                         \
1931     gref = c / 9, nref = c % 9;                         \
1932     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1933   } while (0)
1934
1935
1936 /* Decode a composition rule represented as a component of composition
1937    sequence of Emacs 21 style at SRC.  The rule is two bytes, each
1938    being 0x20 plus a value in 0..80; store the decoded rule in *BUF
1939    and increment BUF.  If fewer than two bytes are left or a value is
        out of range, jump to `invalid_code'.  */
1940
1941 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1942   do {                                                  \
1943     int gref, nref;                                     \
1944                                                         \
1945     if (src + 1>= src_end)                              \
1946       goto invalid_code;                                \
1947     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1948     gref -= 0x20;                                       \
1949     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1950     nref -= 0x20;                                       \
1951     if (gref < 0 || gref >= 81                          \
1952         || nref < 0 || nref >= 81)                      \
1953       goto invalid_code;                                \
1954     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1955   } while (0)
1956
1957 /* Decode an Emacs 21 style composition whose METHOD byte is C (the
        byte read after 0x80).  Read the BYTES and CHARS bytes, record the
        composition with ADD_COMPOSITION_DATA, and, unless the method is
        COMPOSITION_RELATIVE, decode components into CHARBUF until
        consumed_chars reaches consumed_chars_base + BYTES.  Components
        alternate between characters and rules, except that for
        COMPOSITION_WITH_ALTCHARS every component is a character.  If the
        component loop is left early (by the `break' inside
        DECODE_EMACS_MULE_COMPOSITION_CHAR), jump to `invalid_code'.
        Finally the number of decoded components is subtracted from the
        first element written by ADD_COMPOSITION_DATA (presumably the
        negated annotation length -- TODO confirm against that macro).
        Uses the caller's SRC, CHARBUF, consumed_chars and
        consumed_chars_base variables and its `invalid_code' label.  */
1958 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1959   do {                                                                  \
1960     /* Emacs 21 style format.  The first three bytes at SRC are         \
1961        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1962        the byte length of this composition information, CHARS is the    \
1963        number of characters composed by this composition.  */           \
1964     enum composition_method method = c - 0xF2;                          \
1965     int *charbuf_base = charbuf;                                        \
1966     int consumed_chars_limit;                                           \
1967     int nbytes, nchars;                                                 \
1968                                                                         \
1969     ONE_MORE_BYTE (c);                                                  \
1970     if (c < 0)                                                          \
1971       goto invalid_code;                                                \
1972     nbytes = c - 0xA0;                                                  \
1973     if (nbytes < 3)                                                     \
1974       goto invalid_code;                                                \
1975     ONE_MORE_BYTE (c);                                                  \
1976     if (c < 0)                                                          \
1977       goto invalid_code;                                                \
1978     nchars = c - 0xA0;                                                  \
1979     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
1980     consumed_chars_limit = consumed_chars_base + nbytes;                \
1981     if (method != COMPOSITION_RELATIVE)                                 \
1982       {                                                                 \
1983         int i = 0;                                                      \
1984         while (consumed_chars < consumed_chars_limit)                   \
1985           {                                                             \
1986             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1987               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1988             else                                                        \
1989               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1990             i++;                                                        \
1991           }                                                             \
1992         if (consumed_chars < consumed_chars_limit)                      \
1993           goto invalid_code;                                            \
1994         charbuf_base[0] -= i;                                           \
1995       }                                                                 \
1996   } while (0)
1997
1998 /* Decode an Emacs 20 style relative composition (0x80 followed by the
        component characters).  Rewind SRC to SRC_BASE, skip the 0x80,
        decode up to MAX_COMPOSITION_COMPONENTS characters into a local
        array (the loop ends early through the `break' inside
        DECODE_EMACS_MULE_COMPOSITION_CHAR), and jump to `invalid_code' if
        fewer than two were found.  Otherwise record the composition with
        ADD_COMPOSITION_DATA and append the decoded characters to CHARBUF.  */
1999 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
2000   do {                                                          \
2001     /* Emacs 20 style format for relative composition.  */      \
2002     /* Store multibyte form of characters to be composed.  */   \
2003     enum composition_method method = COMPOSITION_RELATIVE;      \
2004     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
2005     int *buf = components;                                      \
2006     int i, j;                                                   \
2007                                                                 \
2008     src = src_base;                                             \
2009     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
2010     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
2011       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
2012     if (i < 2)                                                  \
2013       goto invalid_code;                                        \
2014     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
2015     for (j = 0; j < i; j++)                                     \
2016       *charbuf++ = components[j];                               \
2017   } while (0)
2018
2019 /* Decode an Emacs 20 style rule-base composition (0x80 0xFF followed
        by characters separated by rules).  Decode the first character,
        then (rule, character) pairs while the next byte is >= 0xA0, up to
        MAX_COMPOSITION_COMPONENTS characters.  Jump to `invalid_code'
        unless more than one character was read and the stored components
        end with a character; jump to `no_more_source' if CHARBUF lacks
        room.  Otherwise record the composition with ADD_COMPOSITION_DATA,
        append the interleaved characters and rules, subtract their count
        from the first annotation element, and then append the characters
        alone (every other stored component).
        NOTE(review): the first DECODE_EMACS_MULE_COMPOSITION_CHAR is not
        inside a loop, so its `break' leaves this whole macro; and `*src'
        is read in the loop without a check against SRC_END -- confirm
        both are intended.  */
2020 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
2021   do {                                                          \
2022     /* Emacs 20 style format for rule-base composition.  */     \
2023     /* Store multibyte form of characters to be composed.  */   \
2024     enum composition_method method = COMPOSITION_WITH_RULE;     \
2025     int *charbuf_base = charbuf;                                \
2026     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
2027     int *buf = components;                                      \
2028     int i, j;                                                   \
2029                                                                 \
2030     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
2031     for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++)            \
2032       {                                                         \
2033         if (*src < 0xA0)                                        \
2034           break;                                                \
2035         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
2036         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
2037       }                                                         \
2038     if (i <= 1 || (buf - components) % 2 == 0)                  \
2039       goto invalid_code;                                        \
2040     if (charbuf + i + (i / 2) + 1 >= charbuf_end)               \
2041       goto no_more_source;                                      \
2042     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
2043     i = i * 2 - 1;                                              \
2044     for (j = 0; j < i; j++)                                     \
2045       *charbuf++ = components[j];                               \
2046     charbuf_base[0] -= i;                                       \
2047     for (j = 0; j < i; j += 2)                                  \
2048       *charbuf++ = components[j];                               \
2049   } while (0)
2050
2051 /* Decode the emacs-mule byte sequence in CODING->source, starting at
        offset CODING->consumed, into character codes appended to
        CODING->charbuf.  Decoding stops when the source is exhausted or
        when CHARBUF has fewer than MAX_ANNOTATION_LENGTH free slots.  A
        byte that does not start a valid sequence is stored by itself (as
        ASCII or through BYTE8_TO_CHAR) and counted in CODING->errors.
        Whenever the charset of decoded multibyte characters changes, the
        preceding run is recorded with ADD_CHARSET_DATA.  On return
        CODING->consumed, CODING->consumed_char and CODING->charbuf_used
        describe the state at the start of the last unfinished character.  */
2052 static void
2053 decode_coding_emacs_mule (coding)
2054      struct coding_system *coding;
2055 {
2056   const unsigned char *src = coding->source + coding->consumed;
2057   const unsigned char *src_end = coding->source + coding->src_bytes;
2058   const unsigned char *src_base;
2059   int *charbuf = coding->charbuf + coding->charbuf_used;
       /* Keep MAX_ANNOTATION_LENGTH slots in reserve (an annotation is
          still appended after the loop).  */
2060   int *charbuf_end
2061     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2062   int consumed_chars = 0, consumed_chars_base;
2063   int multibytep = coding->src_multibyte;
2064   Lisp_Object attrs, charset_list;
2065   int char_offset = coding->produced_char;
       /* Start offset and charset id of the current run of characters.  */
2066   int last_offset = char_offset;
2067   int last_id = charset_ascii;
2068
2069   CODING_GET_INFO (coding, attrs, charset_list);
2070
2071   while (1)
2072     {
2073       int c;
2074
           /* Remember the start of this character so that an invalid or
              incomplete sequence can be rewound.  */
2075       src_base = src;
2076       consumed_chars_base = consumed_chars;
2077
2078       if (charbuf >= charbuf_end)
2079         break;
2080
2081       ONE_MORE_BYTE (c);
2082       if (c < 0)
2083         {
               /* A negative value is stored negated.  */
2084           *charbuf++ = -c;
2085           char_offset++;
2086         }
2087       else if (c < 0x80)
2088         {
2089           *charbuf++ = c;
2090           char_offset++;
2091         }
2092       else if (c == 0x80)
2093         {
               /* Composition data; the next byte selects the format.  */
2094           ONE_MORE_BYTE (c);
2095           if (c < 0)
2096             goto invalid_code;
2097           if (c - 0xF2 >= COMPOSITION_RELATIVE
2098               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2099             DECODE_EMACS_MULE_21_COMPOSITION (c);
2100           else if (c < 0xC0)
2101             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2102           else if (c == 0xFF)
2103             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2104           else
2105             goto invalid_code;
2106         }
2107       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2108         {
2109           int nbytes, nchars;
2110           int id;
2111
               /* Rewind and let emacs_mule_char decode the whole sequence.  */
2112           src = src_base;
2113           consumed_chars = consumed_chars_base;
2114           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2115           if (c < 0)
2116             {
                   /* -2: the source ended inside the sequence.  */
2117               if (c == -2)
2118                 break;
2119               goto invalid_code;
2120             }
2121           if (last_id != id)
2122             {
                   /* The charset changed: record the finished run unless
                      it was ASCII, then start a new run here.  */
2123               if (last_id != charset_ascii)
2124                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2125               last_id = id;
2126               last_offset = char_offset;
2127             }
2128           *charbuf++ = c;
2129           src += nbytes;
2130           consumed_chars += nchars;
2131           char_offset++;
2132         }
2133       else
2134         goto invalid_code;
2135       continue;
2136
2137     invalid_code:
           /* Rewind to the first byte of the bad sequence and store that
              byte alone.  */
2138       src = src_base;
2139       consumed_chars = consumed_chars_base;
2140       ONE_MORE_BYTE (c);
2141       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2142       char_offset++;
2143       coding->errors++;
2144     }
2145
2146  no_more_source:
       /* The *_base values are used below, so a partially read character
          is not counted as consumed.  */
2147   if (last_id != charset_ascii)
2148     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2149   coding->consumed_char += consumed_chars_base;
2150   coding->consumed = src_base - coding->source;
2151   coding->charbuf_used = charbuf - coding->charbuf;
2152 }
2153
2154 /* Store in CODES[0] and CODES[1] the emacs-mule leading codes for the
        charset whose emacs-mule id is ID.  An id below 0xA0 is itself the
        only leading code, and CODES[1] is set to 0.  A larger id is
        preceded by 0x9A, 0x9B, 0x9C or 0x9D, chosen by the range the id
        falls in, and is stored in CODES[1].  ID is evaluated more than
        once.  The expansion has no trailing semicolon, so the macro can be
        used as a single statement, e.g. in an unbraced `if ... else'.  */
2155 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2156   do {                                          \
2157     if ((id) < 0xA0)                            \
2158       (codes)[0] = (id), (codes)[1] = 0;        \
2159     else if ((id) < 0xE0)                       \
2160       (codes)[0] = 0x9A, (codes)[1] = (id);     \
2161     else if ((id) < 0xF0)                       \
2162       (codes)[0] = 0x9B, (codes)[1] = (id);     \
2163     else if ((id) < 0xF5)                       \
2164       (codes)[0] = 0x9C, (codes)[1] = (id);     \
2165     else                                        \
2166       (codes)[0] = 0x9D, (codes)[1] = (id);     \
2167   } while (0)
2168
2169 /* Encode the character codes in CODING->charbuf in the emacs-mule
        format and append the bytes at CODING->destination +
        CODING->produced.  ASCII characters and eight-bit bytes are written
        as single bytes; any other character is written as the leading
        code(s) of its charset followed by one or two position codes with
        the high bit set.  A charset annotation in CHARBUF selects the
        charset to prefer for the characters that follow.  A character
        that belongs to no charset of the list is replaced by
        CODING->default_char.  Always returns 0.  */
2170 static int
2171 encode_coding_emacs_mule (coding)
2172      struct coding_system *coding;
2173 {
2174   int multibytep = coding->dst_multibyte;
2175   int *charbuf = coding->charbuf;
2176   int *charbuf_end = charbuf + coding->charbuf_used;
2177   unsigned char *dst = coding->destination + coding->produced;
2178   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Passed to ASSURE_DESTINATION before each character; one character
          takes at most 4 bytes below.  */
2179   int safe_room = 8;
2180   int produced_chars = 0;
2181   Lisp_Object attrs, charset_list;
2182   int c;
       /* Charset id requested by the latest charset annotation, or -1.  */
2183   int preferred_charset_id = -1;
2184
2185   CODING_GET_INFO (coding, attrs, charset_list);
       /* This coder always works with Vemacs_mule_charset_list; install it
          in the coding system's attributes if it is not there yet.  */
2186   if (! EQ (charset_list, Vemacs_mule_charset_list))
2187     {
2188       CODING_ATTR_CHARSET_LIST (attrs)
2189         = charset_list = Vemacs_mule_charset_list;
2190     }
2191
2192   while (charbuf < charbuf_end)
2193     {
2194       ASSURE_DESTINATION (safe_room);
2195       c = *charbuf++;
2196
2197       if (c < 0)
2198         {
2199           /* Handle an annotation.  */
2200           switch (*charbuf)
2201             {
2202             case CODING_ANNOTATE_COMPOSITION_MASK:
2203               /* Not yet implemented.  */
2204               break;
2205             case CODING_ANNOTATE_CHARSET_MASK:
2206               preferred_charset_id = charbuf[3];
                   /* Ignore a preferred charset that is not in the list.  */
2207               if (preferred_charset_id >= 0
2208                   && NILP (Fmemq (make_number (preferred_charset_id),
2209                                   charset_list)))
2210                 preferred_charset_id = -1;
2211               break;
2212             default:
2213               abort ();
2214             }
               /* -C is the length of the annotation counted from its first
                  element; skip what remains of it.  */
2215           charbuf += -c - 1;
2216           continue;
2217         }
2218
2219       if (ASCII_CHAR_P (c))
2220         EMIT_ONE_ASCII_BYTE (c);
2221       else if (CHAR_BYTE8_P (c))
2222         {
2223           c = CHAR_TO_BYTE8 (c);
2224           EMIT_ONE_BYTE (c);
2225         }
2226       else
2227         {
2228           struct charset *charset;
2229           unsigned code;
2230           int dimension;
2231           int emacs_mule_id;
2232           unsigned char leading_codes[2];
2233
2234           if (preferred_charset_id >= 0)
2235             {
2236               charset = CHARSET_FROM_ID (preferred_charset_id);
                   /* CODE must be set on every path: take it from the
                      preferred charset when that charset contains C,
                      otherwise from the charset char_charset picks.  */
2237               if (! CHAR_CHARSET_P (c, charset))
2238                 charset = char_charset (c, charset_list, &code);
                   else
                     code = ENCODE_CHAR (charset, c);
2239             }
2240           else
2241             charset = char_charset (c, charset_list, &code);
2242           if (! charset)
2243             {
                   /* No charset of the list contains C; fall back to the
                      default character.  */
2244               c = coding->default_char;
2245               if (ASCII_CHAR_P (c))
2246                 {
2247                   EMIT_ONE_ASCII_BYTE (c);
2248                   continue;
2249                 }
2250               charset = char_charset (c, charset_list, &code);
2251             }
2252           dimension = CHARSET_DIMENSION (charset);
2253           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2254           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2255           EMIT_ONE_BYTE (leading_codes[0]);
2256           if (leading_codes[1])
2257             EMIT_ONE_BYTE (leading_codes[1]);
               /* Position codes carry the high bit.  */
2258           if (dimension == 1)
2259             EMIT_ONE_BYTE (code | 0x80);
2260           else
2261             {
2262               code |= 0x8080;
2263               EMIT_ONE_BYTE (code >> 8);
2264               EMIT_ONE_BYTE (code & 0xFF);
2265             }
2266         }
2267     }
2268   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2269   coding->produced_char += produced_chars;
2270   coding->produced = dst - coding->destination;
2271   return 0;
2272 }
2273
2274 \f
2275 /*** 7. ISO2022 handlers ***/
2276
2277 /* The following note describes the coding system ISO2022 briefly.
2278    Since the intention of this note is to help understand the
2279    functions in this file, some parts are NOT ACCURATE or are OVERLY
2280    SIMPLIFIED.  For thorough understanding, please refer to the
2281    original document of ISO2022.  This is equivalent to the standard
2282    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2283
2284    ISO2022 provides many mechanisms to encode several character sets
2285    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2286    is encoded using bytes less than 128.  This may make the encoded
2287    text a little bit longer, but the text passes more easily through
2288    several types of gateway, some of which strip off the MSB (Most
2289    Significant Bit).
2290
2291    There are two kinds of character sets: control character sets and
2292    graphic character sets.  The former contain control characters such
2293    as `newline' and `escape' to provide control functions (control
2294    functions are also provided by escape sequences).  The latter
2295    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2296    two control character sets and many graphic character sets.
2297
2298    Graphic character sets are classified into one of the following
2299    four classes, according to the number of bytes (DIMENSION) and
2300    number of characters in one dimension (CHARS) of the set:
2301    - DIMENSION1_CHARS94
2302    - DIMENSION1_CHARS96
2303    - DIMENSION2_CHARS94
2304    - DIMENSION2_CHARS96
2305
2306    In addition, each character set is assigned an identification tag,
2307    unique for each set, called the "final character" (denoted as <F>
2308    hereafter).  The <F> of each character set is decided by ECMA(*)
2309    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2310    (0x30..0x3F are for private use only).
2311
2312    Note (*): ECMA = European Computer Manufacturers Association
2313
2314    Here are examples of graphic character sets [NAME(<F>)]:
2315         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2316         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2317         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2318         o DIMENSION2_CHARS96 -- none for the moment
2319
2320    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2321         C0 [0x00..0x1F] -- control character plane 0
2322         GL [0x20..0x7F] -- graphic character plane 0
2323         C1 [0x80..0x9F] -- control character plane 1
2324         GR [0xA0..0xFF] -- graphic character plane 1
2325
2326    A control character set is directly designated and invoked to C0 or
2327    C1 by an escape sequence.  The most common case is that:
2328    - ISO646's  control character set is designated/invoked to C0, and
2329    - ISO6429's control character set is designated/invoked to C1,
2330    and usually these designations/invocations are omitted in encoded
2331    text.  In a 7-bit environment, only C0 can be used, and a control
2332    character for C1 is encoded by an appropriate escape sequence to
2333    fit into the environment.  All control characters for C1 are
2334    defined to have corresponding escape sequences.
2335
2336    A graphic character set is at first designated to one of four
2337    graphic registers (G0 through G3), then these graphic registers are
2338    invoked to GL or GR.  These designations and invocations can be
2339    done independently.  The most common case is that G0 is invoked to
2340    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2341    these invocations and designations are omitted in encoded text.
2342    In a 7-bit environment, only GL can be used.
2343
2344    When a graphic character set of CHARS94 is invoked to GL, codes
2345    0x20 and 0x7F of the GL area work as control characters SPACE and
2346    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2347    be used.
2348
2349    There are two ways of invocation: locking-shift and single-shift.
2350    With locking-shift, the invocation lasts until the next different
2351    invocation, whereas with single-shift, the invocation affects the
2352    following character only and doesn't affect the locking-shift
2353    state.  Invocations are done by the following control characters or
2354    escape sequences:
2355
2356    ----------------------------------------------------------------------
2357    abbrev  function                  cntrl escape seq   description
2358    ----------------------------------------------------------------------
2359    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2360    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2361    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2362    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2363    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2364    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2365    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2366    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2367    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2368    ----------------------------------------------------------------------
2369    (*) These are not used by any known coding system.
2370
2371    Control characters for these functions are defined by macros
2372    ISO_CODE_XXX in `coding.h'.
2373
2374    Designations are done by the following escape sequences:
2375    ----------------------------------------------------------------------
2376    escape sequence      description
2377    ----------------------------------------------------------------------
2378    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2379    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2380    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2381    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2382    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2383    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2384    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2385    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2386    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2387    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2388    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2389    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2390    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2391    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2392    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2393    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2394    ----------------------------------------------------------------------
2395
2396    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2397    of dimension 1, chars 94, and final character <F>, etc...
2398
2399    Note (*): Although these designations are not allowed in ISO2022,
2400    Emacs accepts them on decoding, and produces them on encoding
2401    CHARS96 character sets in a coding system which is characterized as
2402    7-bit environment, non-locking-shift, and non-single-shift.
2403
2404    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2405    '(' must be omitted.  We refer to this as "short-form" hereafter.
2406
2407    Now you may notice that there are a lot of ways of encoding the
2408    same multilingual text in ISO2022.  Actually, there exist many
2409    coding systems such as Compound Text (used in X11's inter-client
2410    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2411    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2412    localized platforms), and all of these are variants of ISO2022.
2413
2414    In addition to the above, Emacs handles two more kinds of escape
2415    sequences: ISO6429's direction specification and Emacs' private
2416    sequence for specifying character composition.
2417
2418    ISO6429's direction specification takes the following form:
2419         o CSI ']'      -- end of the current direction
2420         o CSI '0' ']'  -- end of the current direction
2421         o CSI '1' ']'  -- start of left-to-right text
2422         o CSI '2' ']'  -- start of right-to-left text
2423    The control character CSI (0x9B: control sequence introducer) is
2424    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2425
2426    Character composition specification takes the following form:
2427         o ESC '0' -- start relative composition
2428         o ESC '1' -- end composition
2429         o ESC '2' -- start rule-base composition (*)
2430         o ESC '3' -- start relative composition with alternate chars  (**)
2431         o ESC '4' -- start rule-base composition with alternate chars  (**)
2432   Since these are not standard escape sequences of any ISO standard,
2433   the use of them with these meanings is restricted to Emacs only.
2434
2435   (*) This form is used only in Emacs 20.7 and older versions,
2436   but newer versions can safely decode it.
2437   (**) This form is used only in Emacs 21.1 and newer versions,
2438   and older versions can't decode it.
2439
2440   Here's a list of example usages of these composition escape
2441   sequences (categorized by `enum composition_method').
2442
2443   COMPOSITION_RELATIVE:
2444         ESC 0 CHAR [ CHAR ] ESC 1
2445   COMPOSITION_WITH_RULE:
2446         ESC 2 CHAR [ RULE CHAR ] ESC 1
2447   COMPOSITION_WITH_ALTCHARS:
2448         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2449   COMPOSITION_WITH_RULE_ALTCHARS:
2450         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2451
/* Code class of every possible byte value, indexed by the byte itself.
   decode_coding_iso_2022 switches on iso_code_class[c1] to decide how
   to treat the byte it has just read.  */
2452 enum iso_code_class_type iso_code_class[256];
2453
/* Return nonzero if the charset ID may be used with CODING, i.e. ID is
   within CODING's safe_charsets table and its entry is not 255, the
   filler byte that setup_iso_safe_charsets leaves for charsets that
   have no graphic register assigned.

   The entry is compared as an unsigned byte.  The table is accessed
   through a plain `char' pointer, and the former test `>= 0' is always
   true on platforms where plain `char' is unsigned, which made every
   in-range charset look safe there.

   ID is evaluated twice and must not be negative (callers check that
   before using this macro).  */
#define SAFE_CHARSET_P(coding, id)				\
  ((id) <= (coding)->max_charset_id				\
   && (unsigned char) (coding)->safe_charsets[id] != 255)
2457
2458
/* Nonzero if the coding system registered for CATEGORY in
   coding_categories has a non-negative CODING_ISO_INITIAL value for
   index 1 (presumably the charset initially designated to graphic
   register G1, i.e. something for shift-out to invoke -- confirm
   against the definition of CODING_ISO_INITIAL).  */
#define SHIFT_OUT_OK(category)						\
  (0 <= CODING_ISO_INITIAL (&coding_categories[category], 1))
2461
2462 static void
2463 setup_iso_safe_charsets (attrs)
2464      Lisp_Object attrs;
2465 {
2466   Lisp_Object charset_list, safe_charsets;
2467   Lisp_Object request;
2468   Lisp_Object reg_usage;
2469   Lisp_Object tail;
2470   int reg94, reg96;
2471   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2472   int max_charset_id;
2473
2474   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2475   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2476       && ! EQ (charset_list, Viso_2022_charset_list))
2477     {
2478       CODING_ATTR_CHARSET_LIST (attrs)
2479         = charset_list = Viso_2022_charset_list;
2480       ASET (attrs, coding_attr_safe_charsets, Qnil);
2481     }
2482
2483   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2484     return;
2485
2486   max_charset_id = 0;
2487   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2488     {
2489       int id = XINT (XCAR (tail));
2490       if (max_charset_id < id)
2491         max_charset_id = id;
2492     }
2493
2494   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2495                                 make_number (255));
2496   request = AREF (attrs, coding_attr_iso_request);
2497   reg_usage = AREF (attrs, coding_attr_iso_usage);
2498   reg94 = XINT (XCAR (reg_usage));
2499   reg96 = XINT (XCDR (reg_usage));
2500
2501   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2502     {
2503       Lisp_Object id;
2504       Lisp_Object reg;
2505       struct charset *charset;
2506
2507       id = XCAR (tail);
2508       charset = CHARSET_FROM_ID (XINT (id));
2509       reg = Fcdr (Fassq (id, request));
2510       if (! NILP (reg))
2511         SSET (safe_charsets, XINT (id), XINT (reg));
2512       else if (charset->iso_chars_96)
2513         {
2514           if (reg96 < 4)
2515             SSET (safe_charsets, XINT (id), reg96);
2516         }
2517       else
2518         {
2519           if (reg94 < 4)
2520             SSET (safe_charsets, XINT (id), reg94);
2521         }
2522     }
2523   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2524 }
2525
2526
2527 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2528    Check if a text is encoded in one of ISO-2022 based coding systems.
2529    If it is, return 1, else return 0.  */
     /* The scan starts coding->head_ascii bytes into coding->source.
        Categories contradicted by the bytes seen so far accumulate in
        `rejected'; categories supported by them accumulate in `found'.
        Once every ISO category is rejected, CATEGORY_MASK_ISO is added
        to DETECT_INFO->rejected and 0 is returned.  Otherwise control
        ends up at `no_more_source' (presumably jumped to by
        ONE_MORE_BYTE when SRC reaches SRC_END -- confirm against the
        macro), where `rejected' and `found & ~rejected' are merged
        into DETECT_INFO and 1 is returned.  */
2530
2531 static int
2532 detect_coding_iso_2022 (coding, detect_info)
2533      struct coding_system *coding;
2534      struct coding_detection_info *detect_info;
2535 {
2536   const unsigned char *src = coding->source, *src_base = src;
2537   const unsigned char *src_end = coding->source + coding->src_bytes;
2538   int multibytep = coding->src_multibyte;
2539   int single_shifting = 0;
2540   int id;
2541   int c, c1;
2542   int consumed_chars = 0;
2543   int i;
2544   int rejected = 0;
2545   int found = 0;
2546
2547   detect_info->checked |= CATEGORY_MASK_ISO;
2548
       /* Point every ISO category at its safe-charsets string so that
          SAFE_CHARSET_P can be applied to coding_categories[] below.  */
2549   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2550     {
2551       struct coding_system *this = &(coding_categories[i]);
2552       Lisp_Object attrs, val;
2553
2554       attrs = CODING_ID_ATTRS (this->id);
           /* NOTE(review): this compares the safe-charsets attribute with
              Viso_2022_charset_list, whereas setup_iso_safe_charsets
              compares the charset-list attribute with it.  Possibly
              CODING_ATTR_CHARSET_LIST was intended -- confirm.  */
2555       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2556           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2557         setup_iso_safe_charsets (attrs);
2558       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2559       this->max_charset_id = SCHARS (val) - 1;
2560       this->safe_charsets = (char *) SDATA (val);
2561     }
2562
2563   /* A coding system of this category is always ASCII compatible.  */
2564   src += coding->head_ascii;
2565
       /* Keep scanning until every ISO category has been rejected.  */
2566   while (rejected != CATEGORY_MASK_ISO)
2567     {
2568       src_base = src;
2569       ONE_MORE_BYTE (c);
2570       switch (c)
2571         {
2572         case ISO_CODE_ESC:
2573           if (inhibit_iso_escape_detection)
2574             break;
2575           single_shifting = 0;
2576           ONE_MORE_BYTE (c);
2577           if (c >= '(' && c <= '/')
2578             {
2579               /* Designation sequence for a charset of dimension 1.  */
2580               ONE_MORE_BYTE (c1);
2581               if (c1 < ' ' || c1 >= 0x80
2582                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2583                 /* Invalid designation sequence.  Just ignore.  */
2584                 break;
2585             }
2586           else if (c == '$')
2587             {
2588               /* Designation sequence for a charset of dimension 2.  */
2589               ONE_MORE_BYTE (c);
2590               if (c >= '@' && c <= 'B')
2591                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2592                 id = iso_charset_table[1][0][c];
2593               else if (c >= '(' && c <= '/')
2594                 {
2595                   ONE_MORE_BYTE (c1);
2596                   if (c1 < ' ' || c1 >= 0x80
2597                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2598                     /* Invalid designation sequence.  Just ignore.  */
2599                     break;
2600                 }
2601               else
2602                 /* Invalid designation sequence.  Just ignore it.  */
2603                 break;
2604             }
2605           else if (c == 'N' || c == 'O')
2606             {
2607               /* ESC <Fe> for SS2 or SS3.  */
2608               single_shifting = 1;
2609               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2610               break;
2611             }
2612           else if (c >= '0' && c <= '4')
2613             {
2614               /* ESC <Fp> for start/end composition.  */
2615               found |= CATEGORY_MASK_ISO;
2616               break;
2617             }
2618           else
2619             {
2620               /* Invalid escape sequence.  Just ignore it.  */
2621               break;
2622             }
2623
2624           /* We found a valid designation sequence for CHARSET.  */
               /* Each category that lists the charset ID as safe is marked
                  found; each one that does not is rejected.  */
2625           rejected |= CATEGORY_MASK_ISO_8BIT;
2626           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2627                               id))
2628             found |= CATEGORY_MASK_ISO_7;
2629           else
2630             rejected |= CATEGORY_MASK_ISO_7;
2631           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2632                               id))
2633             found |= CATEGORY_MASK_ISO_7_TIGHT;
2634           else
2635             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2636           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2637                               id))
2638             found |= CATEGORY_MASK_ISO_7_ELSE;
2639           else
2640             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2641           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2642                               id))
2643             found |= CATEGORY_MASK_ISO_8_ELSE;
2644           else
2645             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2646           break;
2647
2648         case ISO_CODE_SO:
2649         case ISO_CODE_SI:
2650           /* Locking shift out/in.  */
2651           if (inhibit_iso_escape_detection)
2652             break;
2653           single_shifting = 0;
2654           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2655           found |= CATEGORY_MASK_ISO_ELSE;
2656           break;
2657
2658         case ISO_CODE_CSI:
2659           /* Control sequence introducer.  */
2660           single_shifting = 0;
2661           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2662           found |= CATEGORY_MASK_ISO_8_ELSE;
2663           goto check_extra_latin;
2664
2665         case ISO_CODE_SS2:
2666         case ISO_CODE_SS3:
2667           /* Single shift.   */
2668           if (inhibit_iso_escape_detection)
2669             break;
2670           single_shifting = 0;
2671           rejected |= CATEGORY_MASK_ISO_7BIT;
2672           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2673               & CODING_ISO_FLAG_SINGLE_SHIFT)
2674             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2675           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2676               & CODING_ISO_FLAG_SINGLE_SHIFT)
2677             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2678           if (single_shifting)
2679             break;
2680           goto check_extra_latin;
2681
2682         default:
               /* NOTE(review): presumably ONE_MORE_BYTE yields a negative
                  value for something that is not a plain byte in
                  multibyte source -- confirm against the macro.  */
2683           if (c < 0)
2684             continue;
2685           if (c < 0x80)
2686             {
2687               single_shifting = 0;
2688               break;
2689             }
2690           if (c >= 0xA0)
2691             {
2692               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2693               found |= CATEGORY_MASK_ISO_8_1;
2694               /* Check the length of succeeding codes of the range
2695                  0xA0..0xFF.  If the byte length is even, we include
2696                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2697                  only when we are not single shifting.  */
2698               if (! single_shifting
2699                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2700                 {
                       /* This `i' shadows the function-level `i'.  It counts
                          the bytes of the run, starting with the one already
                          read.  */
2701                   int i = 1;
2702                   while (src < src_end)
2703                     {
2704                       ONE_MORE_BYTE (c);
2705                       if (c < 0xA0)
2706                         break;
2707                       i++;
2708                     }
2709
                       /* An odd-length run rejects ISO_8_2 only when it did
                          not end at the end of the source.  */
2710                   if (i & 1 && src < src_end)
2711                     rejected |= CATEGORY_MASK_ISO_8_2;
2712                   else
2713                     found |= CATEGORY_MASK_ISO_8_2;
2714                 }
2715               break;
2716             }
               /* Reached by falling through for a byte from 0x80 up to but
                  not including 0xA0, and by the jumps from the CSI and
                  SS2/SS3 cases.  Unless Vlatin_extra_code_table is a vector
                  with a non-nil entry for C, every ISO category is
                  rejected.  */
2717         check_extra_latin:
2718           single_shifting = 0;
2719           if (! VECTORP (Vlatin_extra_code_table)
2720               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2721             {
2722               rejected = CATEGORY_MASK_ISO;
2723               break;
2724             }
2725           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2726               & CODING_ISO_FLAG_LATIN_EXTRA)
2727             found |= CATEGORY_MASK_ISO_8_1;
2728           else
2729             rejected |= CATEGORY_MASK_ISO_8_1;
2730           rejected |= CATEGORY_MASK_ISO_8_2;
2731         }
2732     }
       /* Every ISO category was rejected before the source ran out.  */
2733   detect_info->rejected |= CATEGORY_MASK_ISO;
2734   return 0;
2735
2736  no_more_source:
2737   detect_info->rejected |= rejected;
2738   detect_info->found |= (found & ~rejected);
2739   return 1;
2740 }
2741
2742
2743 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
2744    escape sequence should be kept.

        The charset is looked up with ISO_CHARSET_TABLE (DIM, CHARS_96,
        FINAL).  If FINAL is below '0' or not below 128, the lookup
        yields a negative id, or the charset is not SAFE_CHARSET_P for
        `coding', the designation of REG is set to -2 and CHARS_96 to -1.
        Otherwise the charset id becomes the designation of REG, after
        replacing charset_jisx0201_roman by charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set and charset_jisx0208_1978 by
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.
        CHARS_96 is also set to -1 when the previous designation of REG
        was -2 and the new charset is ASCII.

        The macro uses the variable `coding' of the expanding function,
        assigns to CHARS_96 (which must therefore be an lvalue) and
        evaluates its arguments more than once.  */
2745 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2746   do {                                                                  \
2747     int id, prev;                                                       \
2748                                                                         \
2749     if (final < '0' || final >= 128                                     \
2750         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2751         || !SAFE_CHARSET_P (coding, id))                                \
2752       {                                                                 \
2753         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2754         chars_96 = -1;                                                  \
2755         break;                                                          \
2756       }                                                                 \
2757     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2758     if (id == charset_jisx0201_roman)                                   \
2759       {                                                                 \
2760         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2761           id = charset_ascii;                                           \
2762       }                                                                 \
2763     else if (id == charset_jisx0208_1978)                               \
2764       {                                                                 \
2765         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2766           id = charset_jisx0208;                                        \
2767       }                                                                 \
2768     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2769     /* If there was an invalid designation to REG previously, and this  \
2770        designation is ASCII to REG, we should keep this designation     \
2771        sequence.  */                                                    \
2772     if (prev == -2 && id == charset_ascii)                              \
2773       chars_96 = -1;                                                    \
2774   } while (0)
2775
2776
/* Flush a composition that is still in progress without recording any
   composition data.  Do nothing when composition_state is
   COMPOSING_NO.  Otherwise reset composition_state and copy the
   entries collected in `components' to charbuf as ordinary characters:
   all component_idx entries for COMPOSITION_RELATIVE and
   COMPOSITION_WITH_ALTCHARS, every second entry (even indices) for the
   other methods.  Jump to `no_more_source' when charbuf +
   component_idx would pass charbuf_end.

   NOTE(review): in the every-second-entry branch char_offset grows by
   component_idx / 2 + 1, which matches the number of entries copied
   only when component_idx is odd -- confirm that it is never even
   there.

   Uses charbuf, charbuf_end, char_offset, components, component_idx,
   method and composition_state from the expanding function.  */
2777 #define MAYBE_FINISH_COMPOSITION()                              \
2778   do {                                                          \
2779     int i;                                                      \
2780     if (composition_state == COMPOSING_NO)                      \
2781       break;                                                    \
2782     /* It is assured that we have enough room for producing     \
2783        characters stored in the table `components'.  */         \
2784     if (charbuf + component_idx > charbuf_end)                  \
2785       goto no_more_source;                                      \
2786     composition_state = COMPOSING_NO;                           \
2787     if (method == COMPOSITION_RELATIVE                          \
2788         || method == COMPOSITION_WITH_ALTCHARS)                 \
2789       {                                                         \
2790         for (i = 0; i < component_idx; i++)                     \
2791           *charbuf++ = components[i];                           \
2792         char_offset += component_idx;                           \
2793       }                                                         \
2794     else                                                        \
2795       {                                                         \
2796         for (i = 0; i < component_idx; i += 2)                  \
2797           *charbuf++ = components[i];                           \
2798         char_offset += (component_idx / 2) + 1;                 \
2799       }                                                         \
2800   } while (0)
2801
2802
2803 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2804    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2805    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2806    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2807    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

        When C1 is '0' and composition_state is COMPOSING_COMPONENT_RULE,
        the ESC 0 only separates the alternate components from the
        characters: component_len is set to component_idx and the state
        becomes COMPOSING_CHAR.

        Otherwise any pending composition is flushed with
        MAYBE_FINISH_COMPOSITION, and the rest of the source is scanned
        for the terminating ESC 1.  If none is found,
        CODING_RESULT_INSUFFICIENT_SRC is recorded and control jumps to
        `no_more_source'; control also jumps there when fewer than
        MAX_COMPOSITION_COMPONENTS slots remain before charbuf_end.  On
        success `method' and `composition_state' are set from C1 and
        component_idx and component_len are reset to 0.

        NOTE(review): the scan loop runs while p < src_end - 1 and the
        "not found" test is p == src_end - 1.  If SRC already equals
        SRC_END the loop body never runs and the test is false, so the
        composition is treated as properly terminated -- confirm whether
        that case can occur.
2808   */
2809
2810 #define DECODE_COMPOSITION_START(c1)                                    \
2811   do {                                                                  \
2812     if (c1 == '0'                                                       \
2813         && composition_state == COMPOSING_COMPONENT_RULE)               \
2814       {                                                                 \
2815         component_len = component_idx;                                  \
2816         composition_state = COMPOSING_CHAR;                             \
2817       }                                                                 \
2818     else                                                                \
2819       {                                                                 \
2820         const unsigned char *p;                                         \
2821                                                                         \
2822         MAYBE_FINISH_COMPOSITION ();                                    \
2823         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2824           goto no_more_source;                                          \
2825         for (p = src; p < src_end - 1; p++)                             \
2826           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2827             break;                                                      \
2828         if (p == src_end - 1)                                           \
2829           {                                                             \
2830             /* The current composition doesn't end in the current       \
2831                source.  */                                              \
2832             record_conversion_result                                    \
2833               (coding, CODING_RESULT_INSUFFICIENT_SRC);                 \
2834             goto no_more_source;                                        \
2835           }                                                             \
2836                                                                         \
2837         /* This is surely the start of a composition.  */               \
2838         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2839                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2840                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2841                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2842         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2843                              : COMPOSING_COMPONENT_CHAR);               \
2844         component_idx = component_len = 0;                              \
2845       }                                                                 \
2846   } while (0)
2847
2848
2849 /* Handle composition end sequence ESC 1.

        NCHARS, the character count passed to ADD_COMPOSITION_DATA, is
        component_idx - component_len when component_len > 0,
        component_idx for COMPOSITION_RELATIVE, and
        (component_idx + 1) / 2 otherwise.  After ADD_COMPOSITION_DATA,
        every method other than COMPOSITION_RELATIVE also copies
        components into charbuf (all component_idx entries when
        component_len is 0, else the first component_len entries) and
        then stores, at the position charbuf had on entry, that
        position minus the current charbuf position (a non-positive
        value).  The characters themselves follow: every second entry
        of `components' for COMPOSITION_WITH_RULE, otherwise the entries
        from component_len on; char_offset is advanced once per entry
        copied.  Finally coding->annotated is set to 1 and
        composition_state returns to COMPOSING_NO.

        Uses charbuf, char_offset, components, component_idx,
        component_len, method, composition_state and coding from the
        expanding function.  */
2850
2851 #define DECODE_COMPOSITION_END()                                        \
2852   do {                                                                  \
2853     int nchars = (component_len > 0 ? component_idx - component_len     \
2854                   : method == COMPOSITION_RELATIVE ? component_idx      \
2855                   : (component_idx + 1) / 2);                           \
2856     int i;                                                              \
2857     int *saved_charbuf = charbuf;                                       \
2858                                                                         \
2859     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
2860     if (method != COMPOSITION_RELATIVE)                                 \
2861       {                                                                 \
2862         if (component_len == 0)                                         \
2863           for (i = 0; i < component_idx; i++)                           \
2864             *charbuf++ = components[i];                                 \
2865         else                                                            \
2866           for (i = 0; i < component_len; i++)                           \
2867             *charbuf++ = components[i];                                 \
2868         *saved_charbuf = saved_charbuf - charbuf;                       \
2869       }                                                                 \
2870     if (method == COMPOSITION_WITH_RULE)                                \
2871       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2872         *charbuf++ = components[i];                                     \
2873     else                                                                \
2874       for (i = component_len; i < component_idx; i++, char_offset++)    \
2875         *charbuf++ = components[i];                                     \
2876     coding->annotated = 1;                                              \
2877     composition_state = COMPOSING_NO;                                   \
2878   } while (0)
2879
2880
2881 /* Decode a composition rule from the byte C1 (and maybe one more byte
2882    from SRC) and store the encoded composition rule back into C1,
2883    which must therefore be an lvalue.

        After 32 is subtracted from C1, a value below 81 is the old
        format (before ver.21): it is split into gref = value / 9 and
        nref = value % 9, with 4 replaced by 10 in either, and encoded
        with COMPOSITION_ENCODE_RULE.  A value from 81 to 92 is the new
        format (after ver.21): a second byte is read into `c2' with
        ONE_MORE_BYTE and the rule is encoded from value - 81 and
        c2 - 32.  Any other value yields 0.

        `c2' is a variable of the expanding function, not a macro
        argument.  */
2884
2885 #define DECODE_COMPOSITION_RULE(c1)                                     \
2886   do {                                                                  \
2887     (c1) -= 32;                                                         \
2888     if (c1 < 81)                /* old format (before ver.21) */        \
2889       {                                                                 \
2890         int gref = (c1) / 9;                                            \
2891         int nref = (c1) % 9;                                            \
2892         if (gref == 4) gref = 10;                                       \
2893         if (nref == 4) nref = 10;                                       \
2894         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2895       }                                                                 \
2896     else if (c1 < 93)           /* new format (after ver.21) */         \
2897       {                                                                 \
2898         ONE_MORE_BYTE (c2);                                             \
2899         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2900       }                                                                 \
2901     else                                                                \
2902       c1 = 0;                                                           \
2903   } while (0)
2904
2905
2906 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2907
2908 static void
2909 decode_coding_iso_2022 (coding)
2910      struct coding_system *coding;
2911 {
2912   const unsigned char *src = coding->source + coding->consumed;
2913   const unsigned char *src_end = coding->source + coding->src_bytes;
2914   const unsigned char *src_base;
2915   int *charbuf = coding->charbuf + coding->charbuf_used;
2916   int *charbuf_end
2917     = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2918   int consumed_chars = 0, consumed_chars_base;
2919   int multibytep = coding->src_multibyte;
2920   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2921   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2922   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2923   int charset_id_2, charset_id_3;
2924   struct charset *charset;
2925   int c;
2926   /* For handling composition sequence.  */
2927 #define COMPOSING_NO                    0
2928 #define COMPOSING_CHAR                  1
2929 #define COMPOSING_RULE                  2
2930 #define COMPOSING_COMPONENT_CHAR        3
2931 #define COMPOSING_COMPONENT_RULE        4
2932
2933   int composition_state = COMPOSING_NO;
2934   enum composition_method method;
2935   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2936   int component_idx;
2937   int component_len;
2938   Lisp_Object attrs, charset_list;
2939   int char_offset = coding->produced_char;
2940   int last_offset = char_offset;
2941   int last_id = charset_ascii;
2942
2943   CODING_GET_INFO (coding, attrs, charset_list);
2944   setup_iso_safe_charsets (attrs);
2945
2946   while (1)
2947     {
2948       int c1, c2;
2949
2950       src_base = src;
2951       consumed_chars_base = consumed_chars;
2952
2953       if (charbuf >= charbuf_end)
2954         break;
2955
2956       ONE_MORE_BYTE (c1);
2957       if (c1 < 0)
2958         goto invalid_code;
2959
2960       /* We produce at most one character.  */
2961       switch (iso_code_class [c1])
2962         {
2963         case ISO_0x20_or_0x7F:
2964           if (composition_state != COMPOSING_NO)
2965             {
2966               if (composition_state == COMPOSING_RULE
2967                   || composition_state == COMPOSING_COMPONENT_RULE)
2968                 {
2969                   DECODE_COMPOSITION_RULE (c1);
2970                   components[component_idx++] = c1;
2971                   composition_state--;
2972                   continue;
2973                 }
2974             }
2975           if (charset_id_0 < 0
2976               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2977             /* This is SPACE or DEL.  */
2978             charset = CHARSET_FROM_ID (charset_ascii);
2979           else
2980             charset = CHARSET_FROM_ID (charset_id_0);
2981           break;
2982
2983         case ISO_graphic_plane_0:
2984           if (composition_state != COMPOSING_NO)
2985             {
2986               if (composition_state == COMPOSING_RULE
2987                   || composition_state == COMPOSING_COMPONENT_RULE)
2988                 {
2989                   DECODE_COMPOSITION_RULE (c1);
2990                   components[component_idx++] = c1;
2991                   composition_state--;
2992                   continue;
2993                 }
2994             }
2995           if (charset_id_0 < 0)
2996             charset = CHARSET_FROM_ID (charset_ascii);
2997           else
2998             charset = CHARSET_FROM_ID (charset_id_0);
2999           break;
3000
3001         case ISO_0xA0_or_0xFF:
3002           if (charset_id_1 < 0
3003               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3004               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3005             goto invalid_code;
3006           /* This is a graphic character, we fall down ... */
3007
3008         case ISO_graphic_plane_1:
3009           if (charset_id_1 < 0)
3010             goto invalid_code;
3011           charset = CHARSET_FROM_ID (charset_id_1);
3012           break;
3013
3014         case ISO_control_0:
3015           MAYBE_FINISH_COMPOSITION ();
3016           charset = CHARSET_FROM_ID (charset_ascii);
3017           break;
3018
3019         case ISO_control_1:
3020           MAYBE_FINISH_COMPOSITION ();
3021           goto invalid_code;
3022
3023         case ISO_shift_out:
3024           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3025               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3026             goto invalid_code;
3027           CODING_ISO_INVOCATION (coding, 0) = 1;
3028           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3029           continue;
3030
3031         case ISO_shift_in:
3032           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3033             goto invalid_code;
3034           CODING_ISO_INVOCATION (coding, 0) = 0;
3035           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3036           continue;
3037
3038         case ISO_single_shift_2_7:
3039         case ISO_single_shift_2:
3040           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3041             goto invalid_code;
3042           /* SS2 is handled as an escape sequence of ESC 'N' */
3043           c1 = 'N';
3044           goto label_escape_sequence;
3045
3046         case ISO_single_shift_3:
3047           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3048             goto invalid_code;
3049           /* SS3 is handled as an escape sequence of ESC 'O' */
3050           c1 = 'O';
3051           goto label_escape_sequence;
3052
3053         case ISO_control_sequence_introducer:
3054           /* CSI is handled as an escape sequence of ESC '[' ...  */
3055           c1 = '[';
3056           goto label_escape_sequence;
3057
3058         case ISO_escape:
3059           ONE_MORE_BYTE (c1);
3060         label_escape_sequence:
3061           /* Escape sequences handled here are invocation,
3062              designation, direction specification, and character
3063              composition specification.  */
3064           switch (c1)
3065             {
3066             case '&':           /* revision of following character set */
3067               ONE_MORE_BYTE (c1);
3068               if (!(c1 >= '@' && c1 <= '~'))
3069                 goto invalid_code;
3070               ONE_MORE_BYTE (c1);
3071               if (c1 != ISO_CODE_ESC)
3072                 goto invalid_code;
3073               ONE_MORE_BYTE (c1);
3074               goto label_escape_sequence;
3075
3076             case '$':           /* designation of 2-byte character set */
3077               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3078                 goto invalid_code;
3079               {
3080                 int reg, chars96;
3081
3082                 ONE_MORE_BYTE (c1);
3083                 if (c1 >= '@' && c1 <= 'B')
3084                   {     /* designation of JISX0208.1978, GB2312.1980,
3085                            or JISX0208.1980 */
3086                     reg = 0, chars96 = 0;
3087                   }
3088                 else if (c1 >= 0x28 && c1 <= 0x2B)
3089                   { /* designation of DIMENSION2_CHARS94 character set */
3090                     reg = c1 - 0x28, chars96 = 0;
3091                     ONE_MORE_BYTE (c1);
3092                   }
3093                 else if (c1 >= 0x2C && c1 <= 0x2F)
3094                   { /* designation of DIMENSION2_CHARS96 character set */
3095                     reg = c1 - 0x2C, chars96 = 1;
3096                     ONE_MORE_BYTE (c1);
3097                   }
3098                 else
3099                   goto invalid_code;
3100                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3101                 /* We must update these variables now.  */
3102                 if (reg == 0)
3103                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3104                 else if (reg == 1)
3105                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3106                 if (chars96 < 0)
3107                   goto invalid_code;
3108               }
3109               continue;
3110
3111             case 'n':           /* invocation of locking-shift-2 */
3112               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3113                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3114                 goto invalid_code;
3115               CODING_ISO_INVOCATION (coding, 0) = 2;
3116               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3117               continue;
3118
3119             case 'o':           /* invocation of locking-shift-3 */
3120               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3121                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3122                 goto invalid_code;
3123               CODING_ISO_INVOCATION (coding, 0) = 3;
3124               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3125               continue;
3126
3127             case 'N':           /* invocation of single-shift-2 */
3128               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3129                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3130                 goto invalid_code;
3131               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3132               if (charset_id_2 < 0)
3133                 charset = CHARSET_FROM_ID (charset_ascii);
3134               else
3135                 charset = CHARSET_FROM_ID (charset_id_2);
3136               ONE_MORE_BYTE (c1);
3137               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3138                 goto invalid_code;
3139               break;
3140
3141             case 'O':           /* invocation of single-shift-3 */
3142               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3143                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3144                 goto invalid_code;
3145               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3146               if (charset_id_3 < 0)
3147                 charset = CHARSET_FROM_ID (charset_ascii);
3148               else
3149                 charset = CHARSET_FROM_ID (charset_id_3);
3150               ONE_MORE_BYTE (c1);
3151               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3152                 goto invalid_code;
3153               break;
3154
3155             case '0': case '2': case '3': case '4': /* start composition */
3156               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3157                 goto invalid_code;
3158               DECODE_COMPOSITION_START (c1);
3159               continue;
3160
3161             case '1':           /* end composition */
3162               if (composition_state == COMPOSING_NO)
3163                 goto invalid_code;
3164               DECODE_COMPOSITION_END ();
3165               continue;
3166
3167             case '[':           /* specification of direction */
3168               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3169                 goto invalid_code;
3170               /* For the moment, nested direction is not supported.
3171                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3172                  left-to-right, and nonzero means right-to-left.  */
3173               ONE_MORE_BYTE (c1);
3174               switch (c1)
3175                 {
3176                 case ']':       /* end of the current direction */
3177                   coding->mode &= ~CODING_MODE_DIRECTION;
3178
3179                 case '0':       /* end of the current direction */
3180                 case '1':       /* start of left-to-right direction */
3181                   ONE_MORE_BYTE (c1);
3182                   if (c1 == ']')
3183                     coding->mode &= ~CODING_MODE_DIRECTION;
3184                   else
3185                     goto invalid_code;
3186                   break;
3187
3188                 case '2':       /* start of right-to-left direction */
3189                   ONE_MORE_BYTE (c1);
3190                   if (c1 == ']')
3191                     coding->mode |= CODING_MODE_DIRECTION;
3192                   else
3193                     goto invalid_code;
3194                   break;
3195
3196                 default:
3197                   goto invalid_code;
3198                 }
3199               continue;
3200
3201             case '%':
3202               ONE_MORE_BYTE (c1);
3203               if (c1 == '/')
3204                 {
3205                   /* CTEXT extended segment:
3206                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3207                      We keep these bytes as is for the moment.
3208                      They may be decoded by post-read-conversion.  */
3209                   int dim, M, L;
3210                   int size;
3211
3212                   ONE_MORE_BYTE (dim);
3213                   ONE_MORE_BYTE (M);
3214                   ONE_MORE_BYTE (L);
3215                   size = ((M - 128) * 128) + (L - 128);
3216                   if (charbuf + 8 + size > charbuf_end)
3217                     goto break_loop;
3218                   *charbuf++ = ISO_CODE_ESC;
3219                   *charbuf++ = '%';
3220                   *charbuf++ = '/';
3221                   *charbuf++ = dim;
3222                   *charbuf++ = BYTE8_TO_CHAR (M);
3223                   *charbuf++ = BYTE8_TO_CHAR (L);
3224                   while (size-- > 0)
3225                     {
3226                       ONE_MORE_BYTE (c1);
3227                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3228                     }
3229                 }
3230               else if (c1 == 'G')
3231                 {
3232                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3233                      ESC % G --UTF-8-BYTES-- ESC % @
3234                      We keep these bytes as is for the moment.
3235                      They may be decoded by post-read-conversion.  */
3236                   int *p = charbuf;
3237
3238                   if (p + 6 > charbuf_end)
3239                     goto break_loop;
3240                   *p++ = ISO_CODE_ESC;
3241                   *p++ = '%';
3242                   *p++ = 'G';
3243                   while (p < charbuf_end)
3244                     {
3245                       ONE_MORE_BYTE (c1);
3246                       if (c1 == ISO_CODE_ESC
3247                           && src + 1 < src_end
3248                           && src[0] == '%'
3249                           && src[1] == '@')
3250                         {
3251                           src += 2;
3252                           break;
3253                         }
3254                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3255                     }
3256                   if (p + 3 > charbuf_end)
3257                     goto break_loop;
3258                   *p++ = ISO_CODE_ESC;
3259                   *p++ = '%';
3260                   *p++ = '@';
3261                   charbuf = p;
3262                 }
3263               else
3264                 goto invalid_code;
3265               continue;
3266               break;
3267
3268             default:
3269               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3270                 goto invalid_code;
3271               {
3272                 int reg, chars96;
3273
3274                 if (c1 >= 0x28 && c1 <= 0x2B)
3275                   { /* designation of DIMENSION1_CHARS94 character set */
3276                     reg = c1 - 0x28, chars96 = 0;
3277                     ONE_MORE_BYTE (c1);
3278                   }
3279                 else if (c1 >= 0x2C && c1 <= 0x2F)
3280                   { /* designation of DIMENSION1_CHARS96 character set */
3281                     reg = c1 - 0x2C, chars96 = 1;
3282                     ONE_MORE_BYTE (c1);
3283                   }
3284                 else
3285                   goto invalid_code;
3286                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3287                 /* We must update these variables now.  */
3288                 if (reg == 0)
3289                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3290                 else if (reg == 1)
3291                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3292                 if (chars96 < 0)
3293                   goto invalid_code;
3294               }
3295               continue;
3296             }
3297         }
3298
3299       if (charset->id != charset_ascii
3300           && last_id != charset->id)
3301         {
3302           if (last_id != charset_ascii)
3303             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3304           last_id = charset->id;
3305           last_offset = char_offset;
3306         }
3307
3308       /* Now we know CHARSET and 1st position code C1 of a character.
3309          Produce a decoded character while getting 2nd position code
3310          C2 if necessary.  */
3311       c1 &= 0x7F;
3312       if (CHARSET_DIMENSION (charset) > 1)
3313         {
3314           ONE_MORE_BYTE (c2);
3315           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3316             /* C2 is not in a valid range.  */
3317             goto invalid_code;
3318           c1 = (c1 << 8) | (c2 & 0x7F);
3319           if (CHARSET_DIMENSION (charset) > 2)
3320             {
3321               ONE_MORE_BYTE (c2);
3322               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3323                 /* C2 is not in a valid range.  */
3324                 goto invalid_code;
3325               c1 = (c1 << 8) | (c2 & 0x7F);
3326             }
3327         }
3328
3329       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3330       if (c < 0)
3331         {
3332           MAYBE_FINISH_COMPOSITION ();
3333           for (; src_base < src; src_base++, char_offset++)
3334             {
3335               if (ASCII_BYTE_P (*src_base))
3336                 *charbuf++ = *src_base;
3337               else
3338                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3339             }
3340         }
3341       else if (composition_state == COMPOSING_NO)
3342         {
3343           *charbuf++ = c;
3344           char_offset++;
3345         }
3346       else
3347         {
3348           components[component_idx++] = c;
3349           if (method == COMPOSITION_WITH_RULE
3350               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3351                   && composition_state == COMPOSING_COMPONENT_CHAR))
3352             composition_state++;
3353         }
3354       continue;
3355
3356     invalid_code:
3357       MAYBE_FINISH_COMPOSITION ();
3358       src = src_base;
3359       consumed_chars = consumed_chars_base;
3360       ONE_MORE_BYTE (c);
3361       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3362       char_offset++;
3363       coding->errors++;
3364       continue;
3365
3366     break_loop:
3367       break;
3368     }
3369
3370  no_more_source:
3371   if (last_id != charset_ascii)
3372     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3373   coding->consumed_char += consumed_chars_base;
3374   coding->consumed = src_base - coding->source;
3375   coding->charbuf_used = charbuf - coding->charbuf;
3376 }
3377
3378
3379 /* ISO2022 encoding stuff.  */
3380
3381 /*
3382    It is not enough to say just "ISO2022" on encoding, we have to
3383    specify more details.  In Emacs, each coding system of ISO2022
3384    variant has the following specifications:
3385         1. Initial designation to G0 thru G3.
3386         2. Allows short-form designation?
3387         3. ASCII should be designated to G0 before control characters?
3388         4. ASCII should be designated to G0 at end of line?
3389         5. 7-bit environment or 8-bit environment?
3390         6. Use locking-shift?
3391         7. Use Single-shift?
3392    And the following two are only for Japanese:
3393         8. Use ASCII in place of JIS0201-1976-Roman?
3394         9. Use JISX0208-1983 in place of JISX0208-1978?
3395    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3396    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3397    details.
3398 */
3399
/* Emit at DST the escape sequence that designates CHARSET to graphic
   register REG, advancing DST, and record the designation in CODING.

   The sequence is: an optional revision prefix (ESC '&' '@'+revision,
   only when CODING has CODING_ISO_FLAG_REVISION and CHARSET has a
   non-negative revision), then ESC, then '$' if the dimension of
   CHARSET is not 1, then an intermediate byte selecting REG, then the
   <final-char> of CHARSET.  The intermediate byte is left out (short
   form) only for a multi-dimension 94-charset whose <final-char> is
   '@', 'A', or 'B', designated to register 0, when CODING does not
   have CODING_ISO_FLAG_LONG_FORM.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    /* Intermediate bytes indexed by register number: "()*+" for a      \
       94-charset, ",-./" for a 96-charset.  */                         \
    const char *intermediates                                           \
      = CHARSET_ISO_CHARS_96 (charset) ? ",-./" : "()*+";               \
    int revision                                                        \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
                                                                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      EMIT_ONE_ASCII_BYTE ('$');                                        \
    /* Only the short form of a multi-dimension 94-charset              \
       designation omits the intermediate byte.  */                     \
    if (CHARSET_DIMENSION (charset) == 1                                \
        || CHARSET_ISO_CHARS_96 (charset)                               \
        || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM)      \
        || reg != 0                                                     \
        || final_char < '@' || final_char > 'B')                        \
      EMIT_ONE_ASCII_BYTE (intermediates[reg]);                         \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3447
3448
/* ENCODE_SINGLE_SHIFT_2 and ENCODE_SINGLE_SHIFT_3 produce the code (a
   control character or an escape sequence) for the ISO2022
   single-shift functions single-shift-2 and single-shift-3, and mark
   `coding' as being in single-shifting state.  */

/* Single-shift-2: ESC 'N' when `coding' has CODING_ISO_FLAG_SEVEN_BITS,
   the single byte ISO_CODE_SS2 otherwise.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3461
3462
/* Single-shift-3: ESC 'O' when `coding' has CODING_ISO_FLAG_SEVEN_BITS,
   the single byte ISO_CODE_SS3 otherwise.  Afterwards `coding' is
   marked as being in single-shifting state.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3471
3472
/* The next four macros produce the codes (a control character or an
   escape sequence) for the ISO2022 locking-shift functions shift-in,
   shift-out, locking-shift-2, and locking-shift-3.  Each one also
   records in `coding' which graphic register is now invoked to
   graphic plane 0.  */

/* Shift-in: emit ISO_CODE_SI; register 0 becomes the one invoked to
   plane 0.  */
#define ENCODE_SHIFT_IN                                 \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                \
      CODING_ISO_INVOCATION (coding, 0) = 0;            \
    }                                                   \
  while (0)
3482
3483
/* Shift-out: emit ISO_CODE_SO; register 1 becomes the one invoked to
   plane 0.  */
#define ENCODE_SHIFT_OUT                                \
  do                                                    \
    {                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                \
      CODING_ISO_INVOCATION (coding, 0) = 1;            \
    }                                                   \
  while (0)
3489
3490
/* Locking-shift-2: emit ESC 'n'; register 2 becomes the one invoked
   to plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do                                                    \
    {                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');         \
      CODING_ISO_INVOCATION (coding, 0) = 2;            \
    }                                                   \
  while (0)
3496
3497
/* Locking-shift-3: emit ESC 'o' and record that register 3 is now
   invoked to plane 0.  The final byte must be 'o', not 'n': ESC 'n'
   is locking-shift-2, and the ISO2022 decoder in this file accepts
   ESC 'o' as the invocation of locking-shift-3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3503
3504
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when it is ASCII and `coding' has
   CODING_ISO_FLAG_USE_ROMAN, it is replaced with the charset of
   charset_jisx0201_roman.  The macro also uses the variables
   `coding', `dst' and `produced_chars' of the expansion site.

   C1 may be any expression; it is parenthesized before the 0x7F mask
   or the 0x80 bit is applied, as ENCODE_ISO_CHARACTER_DIMENSION2 does
   for its position-codes.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift code was just produced; this byte completes   \
           it and ends the single-shifting state.  */                   \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 0.  */                   \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        /* CHARSET is invoked to graphic plane 1.  */                   \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3547
3548
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, first producing whatever designation
   and invocation codes are needed to make CHARSET usable.

   CHARSET must be an lvalue: when it is charset_jisx0208 and `coding'
   has CODING_ISO_FLAG_USE_OLDJIS, it is replaced with the charset of
   charset_jisx0208_1978.  The macro also uses the variables `coding',
   `dst' and `produced_chars' of the expansion site.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    /* A pending single shift decides the form of the bytes by the      \
       7-bit/8-bit environment alone, and is finished by them.  */      \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET invoked to graphic plane 0: bytes without the high       \
       bit.  */                                                         \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET invoked to graphic plane 1: bytes with the high bit.  */ \
    if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))                   \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* CHARSET is invoked to neither plane.  Produce the invocation     \
       (and, if needed, the designation) now, then go around the loop   \
       again to produce the character itself.  */                       \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
3591
3592
/* Produce codes for character C of CHARSET.  The code point obtained
   from ENCODE_CHAR is handed to ENCODE_ISO_CHARACTER_DIMENSION1 as is
   when the dimension of CHARSET is 1; otherwise it is split into
   `code >> 8' and `code & 0xFF' for ENCODE_ISO_CHARACTER_DIMENSION2.
   CHARSET must be an lvalue, because those macros may assign to it.  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int code = ENCODE_CHAR ((charset),(c));                                \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION2 ((charset),                        \
                                         code >> 8, code & 0xFF);          \
      }                                                                    \
    else                                                                   \
      {                                                                    \
        ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                 \
      }                                                                    \
  } while (0)
3602
3603
3604 /* Produce designation and invocation codes at a place pointed by DST
3605    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3606    Return new DST.  */
3607
3608 unsigned char *
3609 encode_invocation_designation (charset, coding, dst, p_nchars)
3610      struct charset *charset;
3611      struct coding_system *coding;
3612      unsigned char *dst;
3613      int *p_nchars;
3614 {
3615   int multibytep = coding->dst_multibyte;
3616   int produced_chars = *p_nchars;
3617   int reg;                      /* graphic register number */
3618   int id = CHARSET_ID (charset);
3619
3620   /* At first, check designations.  */
3621   for (reg = 0; reg < 4; reg++)
3622     if (id == CODING_ISO_DESIGNATION (coding, reg))
3623       break;
3624
3625   if (reg >= 4)
3626     {
3627       /* CHARSET is not yet designated to any graphic registers.  */
3628       /* At first check the requested designation.  */
3629       reg = CODING_ISO_REQUEST (coding, id);
3630       if (reg < 0)
3631         /* Since CHARSET requests no special designation, designate it
3632            to graphic register 0.  */
3633         reg = 0;
3634
3635       ENCODE_DESIGNATION (charset, reg, coding);
3636     }
3637
3638   if (CODING_ISO_INVOCATION (coding, 0) != reg
3639       && CODING_ISO_INVOCATION (coding, 1) != reg)
3640     {
3641       /* Since the graphic register REG is not invoked to any graphic
3642          planes, invoke it to graphic plane 0.  */
3643       switch (reg)
3644         {
3645         case 0:                 /* graphic register 0 */
3646           ENCODE_SHIFT_IN;
3647           break;
3648
3649         case 1:                 /* graphic register 1 */
3650           ENCODE_SHIFT_OUT;
3651           break;
3652
3653         case 2:                 /* graphic register 2 */
3654           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3655             ENCODE_SINGLE_SHIFT_2;
3656           else
3657             ENCODE_LOCKING_SHIFT_2;
3658           break;
3659
3660         case 3:                 /* graphic register 3 */
3661           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3662             ENCODE_SINGLE_SHIFT_3;
3663           else
3664             ENCODE_LOCKING_SHIFT_3;
3665           break;
3666         }
3667     }
3668
3669   *p_nchars = produced_chars;
3670   return dst;
3671 }
3672
/* The following three macros produce codes for indicating direction
   of text.  */

/* Produce a control sequence introducer: ESC '[' when `coding' has
   CODING_ISO_FLAG_SEVEN_BITS, the single byte ISO_CODE_CSI otherwise.
   The flag is tested as a bit, as ENCODE_SINGLE_SHIFT_2 and
   ENCODE_SINGLE_SHIFT_3 do; comparing the whole flag word for
   equality would select the 7-bit form only when no other flag is
   set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
3682
3683
     /* Emit the codes that switch the text direction to right-to-left:
        the control sequence introducer followed by the ASCII bytes '2'
        and ']'.
     
        ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
        is used here without an argument list; writing `(dst)' after it
        would leave a stray `(dst)' behind its `while (0)' and fail to
        compile as soon as this macro is used.  */
3684 #define ENCODE_DIRECTION_R2L()                  \
3685   do {                                          \
3686     ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
3687     EMIT_TWO_ASCII_BYTES ('2', ']');            \
3688   } while (0)
3689
3690
     /* Emit the codes that switch the text direction to left-to-right:
        the control sequence introducer followed by the ASCII bytes '0'
        and ']'.
     
        ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
        is used here without an argument list; writing `(dst)' after it
        would leave a stray `(dst)' behind its `while (0)' and fail to
        compile as soon as this macro is used.  */
3691 #define ENCODE_DIRECTION_L2R()                  \
3692   do {                                          \
3693     ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
3694     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3695   } while (0)
3696
3697
3698 /* Produce codes for designation and invocation to reset the graphic
3699    planes and registers to initial state.
     
        First, if graphic plane 0 is not currently invoking graphic
        register 0, ENCODE_SHIFT_IN is produced.  Then, for each of the
        four graphic registers that has an initial charset (initial ID >=
        0) different from the charset currently designated to it, the
        initial charset is designated again with ENCODE_DESIGNATION.
     
        The macro refers to a variable named `coding' at the expansion
        site; the producing macros it calls presumably need further locals
        there (such as `dst') -- see their definitions.  */
3700 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3701   do {                                                                  \
3702     int reg;                                                            \
3703     struct charset *charset;                                            \
3704                                                                         \
3705     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3706       ENCODE_SHIFT_IN;                                                  \
3707     for (reg = 0; reg < 4; reg++)                                       \
3708       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3709           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3710               != CODING_ISO_INITIAL (coding, reg)))                     \
3711         {                                                               \
3712           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3713           ENCODE_DESIGNATION (charset, reg, coding);                    \
3714         }                                                               \
3715   } while (0)
3716
3717
3718 /* Produce designation sequences of charsets in the line started from
3719    CHARBUF to a place pointed by DST, and return updated DST.
     
        CODING is the coding system being encoded for.  CHARBUF points to
        the first character of the line and CHARBUF_END to just after the
        last character available in the current block; the scan never
        reads at or beyond CHARBUF_END.
     
        The line is scanned up to the first '\n', the end of the block, or
        until a charset has been found for each of the four graphic
        registers, whichever comes first.  For each character whose
        charset requests a graphic register (CODING_ISO_REQUEST >= 0), the
        first such charset seen per register is remembered.  Afterwards a
        designation is produced for every remembered charset that is not
        already designated to its register.
3720
3721    If the current block ends before any end-of-line, we may fail to
3722    find all the necessary designations.  */
3723
3724 static unsigned char *
3725 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3726      struct coding_system *coding;
3727      int *charbuf, *charbuf_end;
3728      unsigned char *dst;
3729 {
3730   struct charset *charset;
3731   /* Table of charsets to be designated to each graphic register.  */
3732   int r[4];
3733   int c, found = 0, reg;
     /* PRODUCED_CHARS and MULTIBYTEP are not referenced directly below;
        they are presumably needed by the macros behind
        ENCODE_DESIGNATION -- TODO confirm.  The caller computes the
        number of produced characters from the change of DST.  */
3734   int produced_chars = 0;
3735   int multibytep = coding->dst_multibyte;
3736   Lisp_Object attrs;
3737   Lisp_Object charset_list;
3738
3739   attrs = CODING_ID_ATTRS (coding->id);
3740   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3741   if (EQ (charset_list, Qiso_2022))
3742     charset_list = Viso_2022_charset_list;
3743
3744   for (reg = 0; reg < 4; reg++)
3745     r[reg] = -1;
3746
     /* Stop at the end of the block as well as when all four registers
        have been decided, so that we never read beyond CHARBUF_END.  */
3747   while (charbuf < charbuf_end && found < 4)
3748     {
3749       int id;
3750
3751       c = *charbuf++;
3752       if (c == '\n')
3753         break;
3754       charset = char_charset (c, charset_list, NULL);
           /* No charset of the list contains C, so it cannot request a
              designation; CHARSET_ID must not be applied to a null
              charset.  */
           if (! charset)
             continue;
3755       id = CHARSET_ID (charset);
3756       reg = CODING_ISO_REQUEST (coding, id);
3757       if (reg >= 0 && r[reg] < 0)
3758         {
3759           found++;
3760           r[reg] = id;
3761         }
3762     }
3763
3764   if (found)
3765     {
3766       for (reg = 0; reg < 4; reg++)
3767         if (r[reg] >= 0
3768             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3769           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3770     }
3771
3772   return dst;
3773 }
3774
3775 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
     
        Encode the characters in CODING->charbuf (CODING->charbuf_used
        elements) into CODING->destination, starting at offset
        CODING->produced, using the ISO 2022 state kept in CODING.
     
        Negative elements of the character buffer introduce annotations;
        a charset annotation selects a preferred charset for the following
        characters, provided that charset is in the coding system's
        charset list.  Control characters are produced as is, after
        optionally resetting designations/invocations as requested by the
        ISO flags.  Other characters are produced through
        ENCODE_ISO_CHARACTER with the charset chosen for them.
     
        On normal completion, the result is recorded as
        CODING_RESULT_SUCCESS, CODING->produced and CODING->produced_char
        are updated, the beginning-of-line designation state is saved back
        into CODING, and 0 is returned.  ASSURE_DESTINATION presumably
        takes care of a full destination -- see its definition.  */
3776
3777 static int
3778 encode_coding_iso_2022 (coding)
3779      struct coding_system *coding;
3780 {
3781   int multibytep = coding->dst_multibyte;
3782   int *charbuf = coding->charbuf;
3783   int *charbuf_end = charbuf + coding->charbuf_used;
3784   unsigned char *dst = coding->destination + coding->produced;
3785   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3786   int safe_room = 16;
     /* Nonzero if designations must be produced before the next
        character, i.e. the coding system designates at beginning of line
        and we are at a beginning of line.  */
3787   int bol_designation
3788     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3789        && CODING_ISO_BOL (coding));
3790   int produced_chars = 0;
3791   Lisp_Object attrs, eol_type, charset_list;
3792   int ascii_compatible;
3793   int c;
3794   int preferred_charset_id = -1;
3795
3796   CODING_GET_INFO (coding, attrs, charset_list);
3797   eol_type = CODING_ID_EOL_TYPE (coding->id);
     /* A vector here means that the EOL type is not decided; treat it
        as Unix style.  */
3798   if (VECTORP (eol_type))
3799     eol_type = Qunix;
3800
3801   setup_iso_safe_charsets (attrs);
3802   /* Charset list may have been changed.  */
3803   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3804   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3805
3806   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3807
3808   while (charbuf < charbuf_end)
3809     {
3810       ASSURE_DESTINATION (safe_room);
3811
3812       if (bol_designation)
3813         {
3814           unsigned char *dst_prev = dst;
3815
3816           /* We have to produce designation sequences if any now.  */
3817           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3818           bol_designation = 0;
3819           /* We are sure that designation sequences are all ASCII bytes.  */
3820           produced_chars += dst - dst_prev;
3821         }
3822
3823       c = *charbuf++;
3824
3825       if (c < 0)
3826         {
3827           /* Handle an annotation.  -C is the total number of elements
                  of the annotation including C itself; CHARBUF now points
                  to the element holding the annotation type.  */
3828           switch (*charbuf)
3829             {
3830             case CODING_ANNOTATE_COMPOSITION_MASK:
3831               /* Not yet implemented.  */
3832               break;
3833             case CODING_ANNOTATE_CHARSET_MASK:
                   /* Accept the annotated charset as the preferred one
                      only if it is in the charset list.  */
3834               preferred_charset_id = charbuf[2];
3835               if (preferred_charset_id >= 0
3836                   && NILP (Fmemq (make_number (preferred_charset_id),
3837                                   charset_list)))
3838                 preferred_charset_id = -1;
3839               break;
3840             default:
3841               abort ();
3842             }
               /* Skip the rest of the annotation.  */
3843           charbuf += -c - 1;
3844           continue;
3845         }
3846
3847       /* Now encode the character C.  */
3848       if (c < 0x20 || c == 0x7F)
3849         {
3850           if (c == '\n'
3851               || (c == '\r' && EQ (eol_type, Qmac)))
3852             {
                   /* End of line: reset the state as the flags request,
                      and arrange for designation at the next line.  */
3853               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3854                 ENCODE_RESET_PLANE_AND_REGISTER ();
3855               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3856                 {
3857                   int i;
3858
3859                   for (i = 0; i < 4; i++)
3860                     CODING_ISO_DESIGNATION (coding, i)
3861                       = CODING_ISO_INITIAL (coding, i);
3862                 }
3863               bol_designation
3864                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3865             }
3866           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3867             ENCODE_RESET_PLANE_AND_REGISTER ();
3868           EMIT_ONE_ASCII_BYTE (c);
3869         }
3870       else if (ASCII_CHAR_P (c))
3871         {
3872           if (ascii_compatible)
3873             EMIT_ONE_ASCII_BYTE (c);
3874           else
3875             {
3876               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3877               ENCODE_ISO_CHARACTER (charset, c);
3878             }
3879         }
3880       else if (CHAR_BYTE8_P (c))
3881         {
               /* A raw 8-bit byte is produced as is.  */
3882           c = CHAR_TO_BYTE8 (c);
3883           EMIT_ONE_BYTE (c);
3884         }
3885       else
3886         {
3887           struct charset *charset;
3888
               /* Use the preferred charset if it contains C; otherwise
                  look for a charset in the charset list.  */
3889           if (preferred_charset_id >= 0)
3890             {
3891               charset = CHARSET_FROM_ID (preferred_charset_id);
3892               if (! CHAR_CHARSET_P (c, charset))
3893                 charset = char_charset (c, charset_list, NULL);
3894             }
3895           else
3896             charset = char_charset (c, charset_list, NULL);
3897           if (!charset)
3898             {
                   /* C is not encodable.  Substitute the inhibit marker
                      in safe-encoding mode, else the default character
                      of the coding system.  */
3899               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3900                 {
3901                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3902                   charset = CHARSET_FROM_ID (charset_ascii);
3903                 }
3904               else
3905                 {
3906                   c = coding->default_char;
3907                   charset = char_charset (c, charset_list, NULL);
3908                 }
3909             }
3910           ENCODE_ISO_CHARACTER (charset, c);
3911         }
3912     }
3913
     /* At the very end of the text, reset the state if the coding
        system resets at end of line.  */
3914   if (coding->mode & CODING_MODE_LAST_BLOCK
3915       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3916     {
3917       ASSURE_DESTINATION (safe_room);
3918       ENCODE_RESET_PLANE_AND_REGISTER ();
3919     }
3920   record_conversion_result (coding, CODING_RESULT_SUCCESS);
3921   CODING_ISO_BOL (coding) = bol_designation;
3922   coding->produced_char += produced_chars;
3923   coding->produced = dst - coding->destination;
3924   return 0;
3925 }
3926
3927 \f
3928 /*** 8,9. SJIS and BIG5 handlers ***/
3929
3930 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3931    quite widely.  So, for the moment, Emacs supports them in the bare
3932    C code.  But, in the future, they may be supported only by CCL.  */
3933
3934 /* SJIS is a coding system encoding three character sets: ASCII, right
3935    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3936    as is.  A character of charset katakana-jisx0201 is encoded by
3937    "position-code + 0x80".  A character of charset japanese-jisx0208
3938    is encoded in two bytes, but the two position-codes are divided and
3939    shifted so that they fit in the range below.
3940
3941    --- CODE RANGE of SJIS ---
3942    (character set)      (range)
3943    ASCII                0x00 .. 0x7F
3944    KATAKANA-JISX0201    0xA0 .. 0xDF
3945    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3946             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3947    -------------------------------
3948
3949 */
3950
3951 /* BIG5 is a coding system encoding two character sets: ASCII and
3952    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3953    character set and is encoded in two bytes.
3954
3955    --- CODE RANGE of BIG5 ---
3956    (character set)      (range)
3957    ASCII                0x00 .. 0x7F
3958    Big5 (1st byte)      0xA1 .. 0xFE
3959         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3960    --------------------------
3961
3962   */
3963
3964 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3965    Check if a text is encoded in SJIS.  CATEGORY_MASK_SJIS is always
3966    recorded in DETECT_INFO->checked.
     
        If a byte sequence invalid for SJIS is found, record
        CATEGORY_MASK_SJIS in DETECT_INFO->rejected and return 0.  The
        same is done if the source ends in the middle of a two-byte
        sequence and this is the last block.  Otherwise, when the source
        is exhausted, record CATEGORY_MASK_SJIS in DETECT_INFO->found if
        at least one non-ASCII SJIS sequence was seen, and return 1.
     
        ONE_MORE_BYTE presumably jumps to the label `no_more_source' when
        no byte is left, and uses the locals SRC, SRC_END, MULTIBYTEP and
        CONSUMED_CHARS -- see its definition.  */
3967
3968 static int
3969 detect_coding_sjis (coding, detect_info)
3970      struct coding_system *coding;
3971      struct coding_detection_info *detect_info;
3972 {
3973   const unsigned char *src = coding->source, *src_base;
3974   const unsigned char *src_end = coding->source + coding->src_bytes;
3975   int multibytep = coding->src_multibyte;
3976   int consumed_chars = 0;
3977   int found = 0;
3978   int c;
3979
3980   detect_info->checked |= CATEGORY_MASK_SJIS;
3981   /* A coding system of this category is always ASCII compatible.  */
3982   src += coding->head_ascii;
3983
3984   while (1)
3985     {
           /* Remember where the current sequence starts.  */
3986       src_base = src;
3987       ONE_MORE_BYTE (c);
3988       if (c < 0x80)
3989         continue;
3990       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3991         {
               /* First byte of a two-byte sequence; check the second.  */
3992           ONE_MORE_BYTE (c);
3993           if (c < 0x40 || c == 0x7F || c > 0xFC)
3994             break;
3995           found = CATEGORY_MASK_SJIS;
3996         }
3997       else if (c >= 0xA0 && c < 0xE0)
             /* One-byte code above the ASCII range.  */
3998         found = CATEGORY_MASK_SJIS;
3999       else
4000         break;
4001     }
     /* We come here only by `break', i.e. on an invalid byte.  */
4002   detect_info->rejected |= CATEGORY_MASK_SJIS;
4003   return 0;
4004
4005  no_more_source:
     /* SRC_BASE < SRC means that the source ended after the first byte
        of a two-byte sequence.  */
4006   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4007     {
4008       detect_info->rejected |= CATEGORY_MASK_SJIS;
4009       return 0;
4010     }
4011   detect_info->found |= found;
4012   return 1;
4013 }
4014
4015 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4016    Check if a text is encoded in BIG5.  CATEGORY_MASK_BIG5 is always
4017    recorded in DETECT_INFO->checked.
     
        If a byte in the range 0x80..0xA0 is found where a first byte is
        expected, record CATEGORY_MASK_BIG5 in DETECT_INFO->rejected and
        return 0.  The same is done if the source ends in the middle of a
        two-byte sequence and this is the last block.  If the second byte
        of a sequence is invalid, return 0 without touching
        DETECT_INFO->rejected.  Otherwise, when the source is exhausted,
        record CATEGORY_MASK_BIG5 in DETECT_INFO->found if at least one
        two-byte sequence was seen, and return 1.
     
        ONE_MORE_BYTE presumably jumps to the label `no_more_source' when
        no byte is left, and uses the locals SRC, SRC_END, MULTIBYTEP and
        CONSUMED_CHARS -- see its definition.  */
4018
4019 static int
4020 detect_coding_big5 (coding, detect_info)
4021      struct coding_system *coding;
4022      struct coding_detection_info *detect_info;
4023 {
4024   const unsigned char *src = coding->source, *src_base;
4025   const unsigned char *src_end = coding->source + coding->src_bytes;
4026   int multibytep = coding->src_multibyte;
4027   int consumed_chars = 0;
4028   int found = 0;
4029   int c;
4030
4031   detect_info->checked |= CATEGORY_MASK_BIG5;
4032   /* A coding system of this category is always ASCII compatible.  */
4033   src += coding->head_ascii;
4034
4035   while (1)
4036     {
           /* Remember where the current sequence starts.  */
4037       src_base = src;
4038       ONE_MORE_BYTE (c);
4039       if (c < 0x80)
4040         continue;
4041       if (c >= 0xA1)
4042         {
               /* First byte of a two-byte sequence; check the second.  */
4043           ONE_MORE_BYTE (c);
               /* NOTE(review): unlike detect_coding_sjis, an invalid
                  second byte returns here without recording the category
                  as rejected -- confirm whether `break' was intended.  */
4044           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4045             return 0;
4046           found = CATEGORY_MASK_BIG5;
4047         }
4048       else
4049         break;
4050     }
     /* We come here only by `break', i.e. on an invalid first byte.  */
4051   detect_info->rejected |= CATEGORY_MASK_BIG5;
4052   return 0;
4053
4054  no_more_source:
     /* SRC_BASE < SRC means that the source ended after the first byte
        of a two-byte sequence.  */
4055   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4056     {
4057       detect_info->rejected |= CATEGORY_MASK_BIG5;
4058       return 0;
4059     }
4060   detect_info->found |= found;
4061   return 1;
4062 }
4063
4064 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4065    Decode SJIS text.
     
        Source bytes are read from CODING->source starting at offset
        CODING->consumed, and decoded characters are appended to
        CODING->charbuf.  The charset list of the coding system supplies,
        in this order, the charsets used for bytes below 0x80, for
        one-byte codes 0xA1..0xDF, for two-byte codes with a first byte up
        to 0xEF, and optionally for two-byte codes with a first byte up to
        0xFC.
     
        A byte sequence that cannot be decoded is handled one byte at a
        time: the first byte is stored as a raw 8-bit character and
        CODING->errors is incremented.
     
        Decoding stops when the source is exhausted (ONE_MORE_BYTE
        presumably jumps to `no_more_source' -- see its definition) or
        when the character buffer is nearly full.  CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used are then updated
        up to the last completely decoded sequence.  */
4066
4067 static void
4068 decode_coding_sjis (coding)
4069      struct coding_system *coding;
4070 {
4071   const unsigned char *src = coding->source + coding->consumed;
4072   const unsigned char *src_end = coding->source + coding->src_bytes;
4073   const unsigned char *src_base;
4074   int *charbuf = coding->charbuf + coding->charbuf_used;
     /* Keep MAX_ANNOTATION_LENGTH elements in reserve, presumably for
        the charset annotation added at the end.  */
4075   int *charbuf_end
4076     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4077   int consumed_chars = 0, consumed_chars_base;
4078   int multibytep = coding->src_multibyte;
4079   struct charset *charset_roman, *charset_kanji, *charset_kana;
4080   struct charset *charset_kanji2;
4081   Lisp_Object attrs, charset_list, val;
     /* LAST_ID is the charset of the current run of non-ASCII
        characters and LAST_OFFSET the character position where that run
        started; they are used to produce charset annotations.  */
4082   int char_offset = coding->produced_char;
4083   int last_offset = char_offset;
4084   int last_id = charset_ascii;
4085
4086   CODING_GET_INFO (coding, attrs, charset_list);
4087
4088   val = charset_list;
4089   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4090   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4091   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4092   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4093
4094   while (1)
4095     {
4096       int c, c1;
4097       struct charset *charset;
4098
           /* Remember the start of this sequence so that we can go back
              to it on an invalid code or on lack of source.  */
4099       src_base = src;
4100       consumed_chars_base = consumed_chars;
4101
4102       if (charbuf >= charbuf_end)
4103         break;
4104
4105       ONE_MORE_BYTE (c);
4106       if (c < 0)
4107         goto invalid_code;
4108       if (c < 0x80)
4109         charset = charset_roman;
4110       else if (c == 0x80 || c == 0xA0)
4111         goto invalid_code;
4112       else if (c >= 0xA1 && c <= 0xDF)
4113         {
4114           /* SJIS -> JISX0201-Kana */
4115           c &= 0x7F;
4116           charset = charset_kana;
4117         }
4118       else if (c <= 0xEF)
4119         {
4120           /* SJIS -> JISX0208 */
4121           ONE_MORE_BYTE (c1);
4122           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4123             goto invalid_code;
4124           c = (c << 8) | c1;
4125           SJIS_TO_JIS (c);
4126           charset = charset_kanji;
4127         }
4128       else if (c <= 0xFC && charset_kanji2)
4129         {
4130           /* SJIS -> JISX0213-2 */
4131           ONE_MORE_BYTE (c1);
4132           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4133             goto invalid_code;
4134           c = (c << 8) | c1;
4135           SJIS_TO_JIS2 (c);
4136           charset = charset_kanji2;
4137         }
4138       else
4139         goto invalid_code;
           /* On a change to a different non-ASCII charset, close the
              previous run with an annotation and start a new run.  */
4140       if (charset->id != charset_ascii
4141           && last_id != charset->id)
4142         {
4143           if (last_id != charset_ascii)
4144             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4145           last_id = charset->id;
4146           last_offset = char_offset;
4147         }
4148       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4149       *charbuf++ = c;
4150       char_offset++;
4151       continue;
4152
4153     invalid_code:
           /* Go back to the start of the sequence and take just its first
              byte as a raw 8-bit character (a negative value from
              ONE_MORE_BYTE is stored negated).  */
4154       src = src_base;
4155       consumed_chars = consumed_chars_base;
4156       ONE_MORE_BYTE (c);
4157       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4158       char_offset++;
4159       coding->errors++;
4160     }
4161
4162  no_more_source:
     /* Also reached by the `break' above when the buffer is full.  */
4163   if (last_id != charset_ascii)
4164     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4165   coding->consumed_char += consumed_chars_base;
4166   coding->consumed = src_base - coding->source;
4167   coding->charbuf_used = charbuf - coding->charbuf;
4168 }
4169
     /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Decode BIG5 text.
     
        Source bytes are read from CODING->source starting at offset
        CODING->consumed, and decoded characters are appended to
        CODING->charbuf.  The charset list of the coding system supplies,
        in this order, the charset used for bytes below 0x80 and the
        charset used for two-byte codes.
     
        A byte sequence that cannot be decoded is handled one byte at a
        time: the first byte is stored as a raw 8-bit character and
        CODING->errors is incremented.
     
        Decoding stops when the source is exhausted (ONE_MORE_BYTE
        presumably jumps to `no_more_source' -- see its definition) or
        when the character buffer is nearly full.  CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used are then updated
        up to the last completely decoded sequence.  */
4170 static void
4171 decode_coding_big5 (coding)
4172      struct coding_system *coding;
4173 {
4174   const unsigned char *src = coding->source + coding->consumed;
4175   const unsigned char *src_end = coding->source + coding->src_bytes;
4176   const unsigned char *src_base;
4177   int *charbuf = coding->charbuf + coding->charbuf_used;
     /* Keep MAX_ANNOTATION_LENGTH elements in reserve, presumably for
        the charset annotation added at the end.  */
4178   int *charbuf_end
4179     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4180   int consumed_chars = 0, consumed_chars_base;
4181   int multibytep = coding->src_multibyte;
4182   struct charset *charset_roman, *charset_big5;
4183   Lisp_Object attrs, charset_list, val;
     /* LAST_ID is the charset of the current run of non-ASCII
        characters and LAST_OFFSET the character position where that run
        started; they are used to produce charset annotations.  */
4184   int char_offset = coding->produced_char;
4185   int last_offset = char_offset;
4186   int last_id = charset_ascii;
4187
4188   CODING_GET_INFO (coding, attrs, charset_list);
4189   val = charset_list;
4190   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4191   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4192
4193   while (1)
4194     {
4195       int c, c1;
4196       struct charset *charset;
4197
           /* Remember the start of this sequence so that we can go back
              to it on an invalid code or on lack of source.  */
4198       src_base = src;
4199       consumed_chars_base = consumed_chars;
4200
4201       if (charbuf >= charbuf_end)
4202         break;
4203
4204       ONE_MORE_BYTE (c);
4205
4206       if (c < 0)
4207         goto invalid_code;
4208       if (c < 0x80)
4209         charset = charset_roman;
4210       else
4211         {
4212           /* BIG5 -> Big5 */
4213           if (c < 0xA1 || c > 0xFE)
4214             goto invalid_code;
4215           ONE_MORE_BYTE (c1);
4216           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4217             goto invalid_code;
4218           c = c << 8 | c1;
4219           charset = charset_big5;
4220         }
           /* On a change to a different non-ASCII charset, close the
              previous run with an annotation and start a new run.  */
4221       if (charset->id != charset_ascii
4222           && last_id != charset->id)
4223         {
4224           if (last_id != charset_ascii)
4225             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4226           last_id = charset->id;
4227           last_offset = char_offset;
4228         }
4229       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4230       *charbuf++ = c;
4231       char_offset++;
4232       continue;
4233
4234     invalid_code:
           /* Go back to the start of the sequence and take just its first
              byte as a raw 8-bit character (a negative value from
              ONE_MORE_BYTE is stored negated).  */
4235       src = src_base;
4236       consumed_chars = consumed_chars_base;
4237       ONE_MORE_BYTE (c);
4238       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4239       char_offset++;
4240       coding->errors++;
4241     }
4242
4243  no_more_source:
     /* Also reached by the `break' above when the buffer is full.  */
4244   if (last_id != charset_ascii)
4245     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4246   coding->consumed_char += consumed_chars_base;
4247   coding->consumed = src_base - coding->source;
4248   coding->charbuf_used = charbuf - coding->charbuf;
4249 }
4250
4251 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4252    Encode the characters in CODING->charbuf into SJIS text.
4253
4254    The charset list of the coding system supplies, in this order, the
4255    roman charset, the kana charset, the kanji charset, and optionally
4256    a second kanji charset.  A character of the kanji charset is
4257    produced as two bytes after JIS_TO_SJIS, a character of the kana
     charset as one byte with the high bit set, and a raw 8-bit
        character as that byte.  A character of the second kanji charset
        is produced as two bytes after JIS_TO_SJIS2 when its row is one
        of those tested below.  In all remaining cases the low 7 bits of
        the code are produced as one ASCII byte.
     
        A character not contained in any charset of the list is replaced
        by CODING_INHIBIT_CHARACTER_SUBSTITUTION in safe-encoding mode,
        else by the default character of the coding system.
     
        On normal completion the result is recorded as
        CODING_RESULT_SUCCESS, CODING->produced and CODING->produced_char
        are updated, and 0 is returned.  ASSURE_DESTINATION presumably
        takes care of a full destination -- see its definition.  */
4258
4259 static int
4260 encode_coding_sjis (coding)
4261      struct coding_system *coding;
4262 {
4263   int multibytep = coding->dst_multibyte;
4264   int *charbuf = coding->charbuf;
4265   int *charbuf_end = charbuf + coding->charbuf_used;
4266   unsigned char *dst = coding->destination + coding->produced;
4267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4268   int safe_room = 4;
4269   int produced_chars = 0;
4270   Lisp_Object attrs, charset_list, val;
4271   int ascii_compatible;
4272   struct charset *charset_roman, *charset_kanji, *charset_kana;
4273   struct charset *charset_kanji2;
4274   int c;
4275
4276   CODING_GET_INFO (coding, attrs, charset_list);
4277   val = charset_list;
4278   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4279   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4280   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4281   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4282
4283   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4284
4285   while (charbuf < charbuf_end)
4286     {
4287       ASSURE_DESTINATION (safe_room);
4288       c = *charbuf++;
4289       /* Now encode the character C.  */
4290       if (ASCII_CHAR_P (c) && ascii_compatible)
4291         EMIT_ONE_ASCII_BYTE (c);
4292       else if (CHAR_BYTE8_P (c))
4293         {
4294           c = CHAR_TO_BYTE8 (c);
4295           EMIT_ONE_BYTE (c);
4296         }
4297       else
4298         {
4299           unsigned code;
4300           struct charset *charset = char_charset (c, charset_list, &code);
4301
4302           if (!charset)
4303             {
                   /* C is not encodable; substitute it.  */
4304               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4305                 {
4306                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4307                   charset = CHARSET_FROM_ID (charset_ascii);
4308                 }
4309               else
4310                 {
4311                   c = coding->default_char;
4312                   charset = char_charset (c, charset_list, &code);
4313                 }
4314             }
4315           if (code == CHARSET_INVALID_CODE (charset))
4316             abort ();
4317           if (charset == charset_kanji)
4318             {
4319               int c1, c2;
4320               JIS_TO_SJIS (code);
4321               c1 = code >> 8, c2 = code & 0xFF;
4322               EMIT_TWO_BYTES (c1, c2);
4323             }
4324           else if (charset == charset_kana)
4325             EMIT_ONE_BYTE (code | 0x80);
4326           else if (charset_kanji2 && charset == charset_kanji2)
4327             {
4328               int c1, c2;
4329
                   /* NOTE(review): JIS X 0213 plane 2 is understood to
                      use rows 1, 3..5, 8, 12..15 and 78..94, but rows
                      0x25 and 0x28 are not accepted here and take the
                      one-byte fallback.  Check this together with
                      JIS_TO_SJIS2 before changing it.  */
4330               c1 = code >> 8;
4331               if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4332                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4333                 {
4334                   JIS_TO_SJIS2 (code);
4335                   c1 = code >> 8, c2 = code & 0xFF;
4336                   EMIT_TWO_BYTES (c1, c2);
4337                 }
4338               else
4339                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4340             }
4341           else
4342             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4343         }
4344     }
4345   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4346   coding->produced_char += produced_chars;
4347   coding->produced = dst - coding->destination;
4348   return 0;
4349 }
4350
     /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
        Encode the characters in CODING->charbuf into BIG5 text.
     
        The charset list of the coding system supplies, in this order,
        the roman charset and the Big5 charset.  A character of the Big5
        charset is produced as the two bytes of its code, and a raw 8-bit
        character as that byte.  In all remaining cases the low 7 bits of
        the code are produced as one ASCII byte.
     
        A character not contained in any charset of the list is replaced
        by CODING_INHIBIT_CHARACTER_SUBSTITUTION in safe-encoding mode,
        else by the default character of the coding system.
     
        On normal completion the result is recorded as
        CODING_RESULT_SUCCESS, CODING->produced and CODING->produced_char
        are updated, and 0 is returned.  ASSURE_DESTINATION presumably
        takes care of a full destination -- see its definition.  */
4351 static int
4352 encode_coding_big5 (coding)
4353      struct coding_system *coding;
4354 {
4355   int multibytep = coding->dst_multibyte;
4356   int *charbuf = coding->charbuf;
4357   int *charbuf_end = charbuf + coding->charbuf_used;
4358   unsigned char *dst = coding->destination + coding->produced;
4359   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4360   int safe_room = 4;
4361   int produced_chars = 0;
4362   Lisp_Object attrs, charset_list, val;
4363   int ascii_compatible;
4364   struct charset *charset_roman, *charset_big5;
4365   int c;
4366
4367   CODING_GET_INFO (coding, attrs, charset_list);
4368   val = charset_list;
4369   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4370   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4371   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4372
4373   while (charbuf < charbuf_end)
4374     {
4375       ASSURE_DESTINATION (safe_room);
4376       c = *charbuf++;
4377       /* Now encode the character C.  */
4378       if (ASCII_CHAR_P (c) && ascii_compatible)
4379         EMIT_ONE_ASCII_BYTE (c);
4380       else if (CHAR_BYTE8_P (c))
4381         {
4382           c = CHAR_TO_BYTE8 (c);
4383           EMIT_ONE_BYTE (c);
4384         }
4385       else
4386         {
4387           unsigned code;
4388           struct charset *charset = char_charset (c, charset_list, &code);
4389
4390           if (! charset)
4391             {
                   /* C is not encodable; substitute it.  */
4392               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4393                 {
4394                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4395                   charset = CHARSET_FROM_ID (charset_ascii);
4396                 }
4397               else
4398                 {
4399                   c = coding->default_char;
4400                   charset = char_charset (c, charset_list, &code);
4401                 }
4402             }
4403           if (code == CHARSET_INVALID_CODE (charset))
4404             abort ();
4405           if (charset == charset_big5)
4406             {
4407               int c1, c2;
4408
                   /* The high byte of the code comes first.  */
4409               c1 = code >> 8, c2 = code & 0xFF;
4410               EMIT_TWO_BYTES (c1, c2);
4411             }
4412           else
4413             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4414         }
4415     }
4416   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4417   coding->produced_char += produced_chars;
4418   coding->produced = dst - coding->destination;
4419   return 0;
4420 }
4421
4422 \f
4423 /*** 10. CCL handlers ***/
4424
4425 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4426    Check if a text is encoded in a coding system of which
4427    encoder/decoder are written in CCL program.  CATEGORY_MASK_CCL is
4428    always recorded in DETECT_INFO->checked.
     
        The source bytes of CODING are checked against the table of valid
        bytes of the coding system registered for the CCL category; note
        that CODING is redirected to that coding system after the source
        pointers have been taken.  If a byte that is not valid is found,
        record CATEGORY_MASK_CCL in DETECT_INFO->rejected and return 0.
        Otherwise, when the source is exhausted, record CATEGORY_MASK_CCL
        in DETECT_INFO->found if some byte had a table value greater than
        1, and return 1.
     
        ONE_MORE_BYTE presumably jumps to the label `no_more_source' when
        no byte is left, and uses the locals SRC, SRC_END, MULTIBYTEP and
        CONSUMED_CHARS -- see its definition.  */
4429
4430 static int
4431 detect_coding_ccl (coding, detect_info)
4432      struct coding_system *coding;
4433      struct coding_detection_info *detect_info;
4434 {
4435   const unsigned char *src = coding->source, *src_base;
4436   const unsigned char *src_end = coding->source + coding->src_bytes;
4437   int multibytep = coding->src_multibyte;
4438   int consumed_chars = 0;
4439   int found = 0;
4440   unsigned char *valids;
4441   int head_ascii = coding->head_ascii;
4442   Lisp_Object attrs;
4443
4444   detect_info->checked |= CATEGORY_MASK_CCL;
4445
     /* From here on, CODING is the coding system of the CCL category,
        not the one passed by the caller.  */
4446   coding = &coding_categories[coding_category_ccl];
4447   valids = CODING_CCL_VALIDS (coding);
4448   attrs = CODING_ID_ATTRS (coding->id);
     /* The leading ASCII bytes need no check if the coding system is
        ASCII compatible.  */
4449   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4450     src += head_ascii;
4451
4452   while (1)
4453     {
4454       int c;
4455
4456       src_base = src;
4457       ONE_MORE_BYTE (c);
4458       if (c < 0 || ! valids[c])
4459         break;
4460       if ((valids[c] > 1))
4461         found = CATEGORY_MASK_CCL;
4462     }
     /* We come here only by `break', i.e. on an invalid byte.  */
4463   detect_info->rejected |= CATEGORY_MASK_CCL;
4464   return 0;
4465
4466  no_more_source:
4467   detect_info->found |= found;
4468   return 1;
4469 }
4470
     /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
        Decode the source of CODING by running the CCL decoder of the
        coding system.
     
        The source is handled in chunks of at most 1024 elements.  Each
        chunk is first unpacked into SOURCE_CHARBUF (one character per
        element for a multibyte source, one byte per element otherwise)
        and then given to ccl_driver, which stores decoded characters
        into CODING->charbuf.  For a multibyte source, SOURCE_BYTEIDX
        records the byte offset of each element so that the number of
        consumed elements can be converted back to a number of bytes.
     
        The final status of the CCL program is translated into a
        conversion result, and CODING->consumed, CODING->consumed_char
        and CODING->charbuf_used are updated.  */
4471 static void
4472 decode_coding_ccl (coding)
4473      struct coding_system *coding;
4474 {
4475   const unsigned char *src = coding->source + coding->consumed;
4476   const unsigned char *src_end = coding->source + coding->src_bytes;
4477   int *charbuf = coding->charbuf + coding->charbuf_used;
4478   int *charbuf_end = coding->charbuf + coding->charbuf_size;
4479   int consumed_chars = 0;
4480   int multibytep = coding->src_multibyte;
4481   struct ccl_program ccl;
4482   int source_charbuf[1024];
4483   int source_byteidx[1024];
4484   Lisp_Object attrs, charset_list;
4485
4486   CODING_GET_INFO (coding, attrs, charset_list);
4487   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4488
4489   while (src < src_end)
4490     {
4491       const unsigned char *p = src;
4492       int *source, *source_end;
4493       int i = 0;
4494
           /* Unpack the next chunk of the source.  */
4495       if (multibytep)
4496         while (i < 1024 && p < src_end)
4497           {
4498             source_byteidx[i] = p - src;
4499             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4500           }
4501       else
4502         while (i < 1024 && p < src_end)
4503           source_charbuf[i++] = *p++;
4504
4505       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4506         ccl.last_block = 1;
4507
4508       source = source_charbuf;
4509       source_end = source + i;
4510       while (source < source_end)
4511         {
4512           ccl_driver (&ccl, source, charbuf,
4513                       source_end - source, charbuf_end - charbuf,
4514                       charset_list);
4515           source += ccl.consumed;
4516           charbuf += ccl.produced;
4517           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4518             break;
4519         }
           /* Advance SRC by the bytes of the consumed elements.
              SOURCE_BYTEIDX is filled in only for a multibyte source; for
              a unibyte source each element is one byte, so the number of
              consumed elements is the number of bytes.  */
4520       if (source < source_end)
4521         src += (multibytep
                     ? source_byteidx[source - source_charbuf]
                     : source - source_charbuf);
4522       else
4523         src = p;
4524       consumed_chars += source - source_charbuf;
4525
           /* NOTE(review): the second test compares a CCL status with a
              CODING_RESULT_* value -- confirm that this is intended.  */
4526       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4527           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4528         break;
4529     }
4530
4531   switch (ccl.status)
4532     {
4533     case CCL_STAT_SUSPEND_BY_SRC:
4534       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4535       break;
4536     case CCL_STAT_SUSPEND_BY_DST:
4537       break;
4538     case CCL_STAT_QUIT:
4539     case CCL_STAT_INVALID_CMD:
4540       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4541       break;
4542     default:
4543       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4544       break;
4545     }
4546   coding->consumed_char += consumed_chars;
4547   coding->consumed = src - coding->source;
4548   coding->charbuf_used = charbuf - coding->charbuf;
4549 }
4550
     /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
        Encode the characters in CODING->charbuf by running the CCL
        encoder of the coding system.
     
        ccl_driver is called repeatedly on the not yet consumed part of
        the character buffer, producing at most 1024 elements into
        DESTINATION_CHARBUF per call.  The low 8 bits of each produced
        element are then stored into CODING->destination as one byte.
        The loop ends when all characters are consumed or when the CCL
        program quits or hits an invalid command.
     
        The final status of the CCL program is translated into a
        conversion result, CODING->produced and CODING->produced_char are
        updated, and 0 is returned.  */
4551 static int
4552 encode_coding_ccl (coding)
4553      struct coding_system *coding;
4554 {
4555   struct ccl_program ccl;
4556   int multibytep = coding->dst_multibyte;
4557   int *charbuf = coding->charbuf;
4558   int *charbuf_end = charbuf + coding->charbuf_used;
4559   unsigned char *dst = coding->destination + coding->produced;
4560   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4561   int destination_charbuf[1024];
4562   int i, produced_chars = 0;
4563   Lisp_Object attrs, charset_list;
4564
4565   CODING_GET_INFO (coding, attrs, charset_list);
4566   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4567
4568   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4569   ccl.dst_multibyte = coding->dst_multibyte;
4570
4571   while (charbuf < charbuf_end)
4572     {
4573       ccl_driver (&ccl, charbuf, destination_charbuf,
4574                   charbuf_end - charbuf, 1024, charset_list);
4575       if (multibytep)
4576         {
               /* Reserve two destination bytes per produced byte;
                  EMIT_ONE_BYTE presumably may store a byte in a two-byte
                  form and counts the produced characters itself -- see
                  its definition.  */
4577           ASSURE_DESTINATION (ccl.produced * 2);
4578           for (i = 0; i < ccl.produced; i++)
4579             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4580         }
4581       else
4582         {
4583           ASSURE_DESTINATION (ccl.produced);
4584           for (i = 0; i < ccl.produced; i++)
4585             *dst++ = destination_charbuf[i] & 0xFF;
4586           produced_chars += ccl.produced;
4587         }
4588       charbuf += ccl.consumed;
4589       if (ccl.status == CCL_STAT_QUIT
4590           || ccl.status == CCL_STAT_INVALID_CMD)
4591         break;
4592     }
4593
4594   switch (ccl.status)
4595     {
4596     case CCL_STAT_SUSPEND_BY_SRC:
4597       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4598       break;
4599     case CCL_STAT_SUSPEND_BY_DST:
4600       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4601       break;
4602     case CCL_STAT_QUIT:
4603     case CCL_STAT_INVALID_CMD:
4604       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4605       break;
4606     default:
4607       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4608       break;
4609     }
4610
4611   coding->produced_char += produced_chars;
4612   coding->produced = dst - coding->destination;
4613   return 0;
4614 }
4615
4616
4617 \f
4618 /*** 10, 11. no-conversion handlers ***/
4619
4620 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4621
4622 static void
4623 decode_coding_raw_text (coding)
4624      struct coding_system *coding;
4625 {
4626   coding->chars_at_source = 1;
4627   coding->consumed_char = 0;
4628   coding->consumed = 0;
4629   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4630 }
4631
4632 static int
4633 encode_coding_raw_text (coding)
4634      struct coding_system *coding;
4635 {
4636   int multibytep = coding->dst_multibyte;
4637   int *charbuf = coding->charbuf;
4638   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4639   unsigned char *dst = coding->destination + coding->produced;
4640   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4641   int produced_chars = 0;
4642   int c;
4643
4644   if (multibytep)
4645     {
4646       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4647
4648       if (coding->src_multibyte)
4649         while (charbuf < charbuf_end)
4650           {
4651             ASSURE_DESTINATION (safe_room);
4652             c = *charbuf++;
4653             if (ASCII_CHAR_P (c))
4654               EMIT_ONE_ASCII_BYTE (c);
4655             else if (CHAR_BYTE8_P (c))
4656               {
4657                 c = CHAR_TO_BYTE8 (c);
4658                 EMIT_ONE_BYTE (c);
4659               }
4660             else
4661               {
4662                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4663
4664                 CHAR_STRING_ADVANCE (c, p1);
4665                 while (p0 < p1)
4666                   {
4667                     EMIT_ONE_BYTE (*p0);
4668                     p0++;
4669                   }
4670               }
4671           }
4672       else
4673         while (charbuf < charbuf_end)
4674           {
4675             ASSURE_DESTINATION (safe_room);
4676             c = *charbuf++;
4677             EMIT_ONE_BYTE (c);
4678           }
4679     }
4680   else
4681     {
4682       if (coding->src_multibyte)
4683         {
4684           int safe_room = MAX_MULTIBYTE_LENGTH;
4685
4686           while (charbuf < charbuf_end)
4687             {
4688               ASSURE_DESTINATION (safe_room);
4689               c = *charbuf++;
4690               if (ASCII_CHAR_P (c))
4691                 *dst++ = c;
4692               else if (CHAR_BYTE8_P (c))
4693                 *dst++ = CHAR_TO_BYTE8 (c);
4694               else
4695                 CHAR_STRING_ADVANCE (c, dst);
4696               produced_chars++;
4697             }
4698         }
4699       else
4700         {
4701           ASSURE_DESTINATION (charbuf_end - charbuf);
4702           while (charbuf < charbuf_end && dst < dst_end)
4703             *dst++ = *charbuf++;
4704           produced_chars = dst - (coding->destination + coding->dst_bytes);
4705         }
4706     }
4707   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4708   coding->produced_char += produced_chars;
4709   coding->produced = dst - coding->destination;
4710   return 0;
4711 }
4712
4713 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4714    Check if a text is encoded in a charset-based coding system.  If it
4715    is, return 1, else return 0.  */
4716
4717 static int
4718 detect_coding_charset (coding, detect_info)
4719      struct coding_system *coding;
4720      struct coding_detection_info *detect_info;
4721 {
4722   const unsigned char *src = coding->source, *src_base;
4723   const unsigned char *src_end = coding->source + coding->src_bytes;
4724   int multibytep = coding->src_multibyte;
4725   int consumed_chars = 0;
4726   Lisp_Object attrs, valids;
4727   int found = 0;
4728
4729   detect_info->checked |= CATEGORY_MASK_CHARSET;
4730
4731   coding = &coding_categories[coding_category_charset];
4732   attrs = CODING_ID_ATTRS (coding->id);
4733   valids = AREF (attrs, coding_attr_charset_valids);
4734
4735   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4736     src += coding->head_ascii;
4737
4738   while (1)
4739     {
4740       int c;
4741
4742       src_base = src;
4743       ONE_MORE_BYTE (c);
4744       if (c < 0)
4745         continue;
4746       if (NILP (AREF (valids, c)))
4747         break;
4748       if (c >= 0x80)
4749         found = CATEGORY_MASK_CHARSET;
4750     }
4751   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4752   return 0;
4753
4754  no_more_source:
4755   detect_info->found |= found;
4756   return 1;
4757 }
4758
4759 static void
4760 decode_coding_charset (coding)
4761      struct coding_system *coding;
4762 {
4763   const unsigned char *src = coding->source + coding->consumed;
4764   const unsigned char *src_end = coding->source + coding->src_bytes;
4765   const unsigned char *src_base;
4766   int *charbuf = coding->charbuf + coding->charbuf_used;
4767   int *charbuf_end
4768     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4769   int consumed_chars = 0, consumed_chars_base;
4770   int multibytep = coding->src_multibyte;
4771   Lisp_Object attrs, charset_list, valids;
4772   int char_offset = coding->produced_char;
4773   int last_offset = char_offset;
4774   int last_id = charset_ascii;
4775
4776   CODING_GET_INFO (coding, attrs, charset_list);
4777   valids = AREF (attrs, coding_attr_charset_valids);
4778
4779   while (1)
4780     {
4781       int c;
4782       Lisp_Object val;
4783       struct charset *charset;
4784       int dim;
4785       int len = 1;
4786       unsigned code;
4787
4788       src_base = src;
4789       consumed_chars_base = consumed_chars;
4790
4791       if (charbuf >= charbuf_end)
4792         break;
4793
4794       ONE_MORE_BYTE (c);
4795       if (c < 0)
4796         goto invalid_code;
4797       code = c;
4798
4799       val = AREF (valids, c);
4800       if (NILP (val))
4801         goto invalid_code;
4802       if (INTEGERP (val))
4803         {
4804           charset = CHARSET_FROM_ID (XFASTINT (val));
4805           dim = CHARSET_DIMENSION (charset);
4806           while (len < dim)
4807             {
4808               ONE_MORE_BYTE (c);
4809               code = (code << 8) | c;
4810               len++;
4811             }
4812           CODING_DECODE_CHAR (coding, src, src_base, src_end,
4813                               charset, code, c);
4814         }
4815       else
4816         {
4817           /* VAL is a list of charset IDs.  It is assured that the
4818              list is sorted by charset dimensions (smaller one
4819              comes first).  */
4820           while (CONSP (val))
4821             {
4822               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4823               dim = CHARSET_DIMENSION (charset);
4824               while (len < dim)
4825                 {
4826                   ONE_MORE_BYTE (c);
4827                   code = (code << 8) | c;
4828                   len++;
4829                 }
4830               CODING_DECODE_CHAR (coding, src, src_base,
4831                                   src_end, charset, code, c);
4832               if (c >= 0)
4833                 break;
4834               val = XCDR (val);
4835             }
4836         }
4837       if (c < 0)
4838         goto invalid_code;
4839       if (charset->id != charset_ascii
4840           && last_id != charset->id)
4841         {
4842           if (last_id != charset_ascii)
4843             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4844           last_id = charset->id;
4845           last_offset = char_offset;
4846         }
4847
4848       *charbuf++ = c;
4849       char_offset++;
4850       continue;
4851
4852     invalid_code:
4853       src = src_base;
4854       consumed_chars = consumed_chars_base;
4855       ONE_MORE_BYTE (c);
4856       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4857       char_offset++;
4858       coding->errors++;
4859     }
4860
4861  no_more_source:
4862   if (last_id != charset_ascii)
4863     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4864   coding->consumed_char += consumed_chars_base;
4865   coding->consumed = src_base - coding->source;
4866   coding->charbuf_used = charbuf - coding->charbuf;
4867 }
4868
4869 static int
4870 encode_coding_charset (coding)
4871      struct coding_system *coding;
4872 {
4873   int multibytep = coding->dst_multibyte;
4874   int *charbuf = coding->charbuf;
4875   int *charbuf_end = charbuf + coding->charbuf_used;
4876   unsigned char *dst = coding->destination + coding->produced;
4877   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4878   int safe_room = MAX_MULTIBYTE_LENGTH;
4879   int produced_chars = 0;
4880   Lisp_Object attrs, charset_list;
4881   int ascii_compatible;
4882   int c;
4883
4884   CODING_GET_INFO (coding, attrs, charset_list);
4885   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4886
4887   while (charbuf < charbuf_end)
4888     {
4889       struct charset *charset;
4890       unsigned code;
4891
4892       ASSURE_DESTINATION (safe_room);
4893       c = *charbuf++;
4894       if (ascii_compatible && ASCII_CHAR_P (c))
4895         EMIT_ONE_ASCII_BYTE (c);
4896       else if (CHAR_BYTE8_P (c))
4897         {
4898           c = CHAR_TO_BYTE8 (c);
4899           EMIT_ONE_BYTE (c);
4900         }
4901       else
4902         {
4903           charset = char_charset (c, charset_list, &code);
4904           if (charset)
4905             {
4906               if (CHARSET_DIMENSION (charset) == 1)
4907                 EMIT_ONE_BYTE (code);
4908               else if (CHARSET_DIMENSION (charset) == 2)
4909                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4910               else if (CHARSET_DIMENSION (charset) == 3)
4911                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4912               else
4913                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4914                                  (code >> 8) & 0xFF, code & 0xFF);
4915             }
4916           else
4917             {
4918               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4919                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4920               else
4921                 c = coding->default_char;
4922               EMIT_ONE_BYTE (c);
4923             }
4924         }
4925     }
4926
4927   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4928   coding->produced_char += produced_chars;
4929   coding->produced = dst - coding->destination;
4930   return 0;
4931 }
4932
4933 \f
4934 /*** 7. C library functions ***/
4935
4936 /* Setup coding context CODING from information about CODING_SYSTEM.
4937    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4938    CODING_SYSTEM is invalid, signal an error.  */
4939
4940 void
4941 setup_coding_system (coding_system, coding)
4942      Lisp_Object coding_system;
4943      struct coding_system *coding;
4944 {
4945   Lisp_Object attrs;
4946   Lisp_Object eol_type;
4947   Lisp_Object coding_type;
4948   Lisp_Object val;
4949
4950   if (NILP (coding_system))
4951     coding_system = Qundecided;
4952
4953   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4954
4955   attrs = CODING_ID_ATTRS (coding->id);
4956   eol_type = CODING_ID_EOL_TYPE (coding->id);
4957
4958   coding->mode = 0;
4959   coding->head_ascii = -1;
4960   coding->common_flags
4961     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4962   if (! NILP (CODING_ATTR_POST_READ (attrs)))
4963     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4964   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4965     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4966   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4967     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4968
4969   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4970   coding->max_charset_id = SCHARS (val) - 1;
4971   coding->safe_charsets = (char *) SDATA (val);
4972   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4973
4974   coding_type = CODING_ATTR_TYPE (attrs);
4975   if (EQ (coding_type, Qundecided))
4976     {
4977       coding->detector = NULL;
4978       coding->decoder = decode_coding_raw_text;
4979       coding->encoder = encode_coding_raw_text;
4980       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4981     }
4982   else if (EQ (coding_type, Qiso_2022))
4983     {
4984       int i;
4985       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4986
4987       /* Invoke graphic register 0 to plane 0.  */
4988       CODING_ISO_INVOCATION (coding, 0) = 0;
4989       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4990       CODING_ISO_INVOCATION (coding, 1)
4991         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4992       /* Setup the initial status of designation.  */
4993       for (i = 0; i < 4; i++)
4994         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4995       /* Not single shifting initially.  */
4996       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4997       /* Beginning of buffer should also be regarded as bol. */
4998       CODING_ISO_BOL (coding) = 1;
4999       coding->detector = detect_coding_iso_2022;
5000       coding->decoder = decode_coding_iso_2022;
5001       coding->encoder = encode_coding_iso_2022;
5002       if (flags & CODING_ISO_FLAG_SAFE)
5003         coding->mode |= CODING_MODE_SAFE_ENCODING;
5004       coding->common_flags
5005         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5006             | CODING_REQUIRE_FLUSHING_MASK);
5007       if (flags & CODING_ISO_FLAG_COMPOSITION)
5008         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5009       if (flags & CODING_ISO_FLAG_DESIGNATION)
5010         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5011       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5012         {
5013           setup_iso_safe_charsets (attrs);
5014           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5015           coding->max_charset_id = SCHARS (val) - 1;
5016           coding->safe_charsets = (char *) SDATA (val);
5017         }
5018       CODING_ISO_FLAGS (coding) = flags;
5019     }
5020   else if (EQ (coding_type, Qcharset))
5021     {
5022       coding->detector = detect_coding_charset;
5023       coding->decoder = decode_coding_charset;
5024       coding->encoder = encode_coding_charset;
5025       coding->common_flags
5026         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5027     }
5028   else if (EQ (coding_type, Qutf_8))
5029     {
5030       coding->detector = detect_coding_utf_8;
5031       coding->decoder = decode_coding_utf_8;
5032       coding->encoder = encode_coding_utf_8;
5033       coding->common_flags
5034         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5035     }
5036   else if (EQ (coding_type, Qutf_16))
5037     {
5038       val = AREF (attrs, coding_attr_utf_16_bom);
5039       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5040                                     : EQ (val, Qt) ? utf_16_with_bom
5041                                     : utf_16_without_bom);
5042       val = AREF (attrs, coding_attr_utf_16_endian);
5043       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5044                                        : utf_16_little_endian);
5045       CODING_UTF_16_SURROGATE (coding) = 0;
5046       coding->detector = detect_coding_utf_16;
5047       coding->decoder = decode_coding_utf_16;
5048       coding->encoder = encode_coding_utf_16;
5049       coding->common_flags
5050         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5051       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5052         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5053     }
5054   else if (EQ (coding_type, Qccl))
5055     {
5056       coding->detector = detect_coding_ccl;
5057       coding->decoder = decode_coding_ccl;
5058       coding->encoder = encode_coding_ccl;
5059       coding->common_flags
5060         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5061             | CODING_REQUIRE_FLUSHING_MASK);
5062     }
5063   else if (EQ (coding_type, Qemacs_mule))
5064     {
5065       coding->detector = detect_coding_emacs_mule;
5066       coding->decoder = decode_coding_emacs_mule;
5067       coding->encoder = encode_coding_emacs_mule;
5068       coding->common_flags
5069         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5070       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5071           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5072         {
5073           Lisp_Object tail, safe_charsets;
5074           int max_charset_id = 0;
5075
5076           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5077                tail = XCDR (tail))
5078             if (max_charset_id < XFASTINT (XCAR (tail)))
5079               max_charset_id = XFASTINT (XCAR (tail));
5080           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5081                                         make_number (255));
5082           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5083                tail = XCDR (tail))
5084             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5085           coding->max_charset_id = max_charset_id;
5086           coding->safe_charsets = (char *) SDATA (safe_charsets);
5087         }
5088     }
5089   else if (EQ (coding_type, Qshift_jis))
5090     {
5091       coding->detector = detect_coding_sjis;
5092       coding->decoder = decode_coding_sjis;
5093       coding->encoder = encode_coding_sjis;
5094       coding->common_flags
5095         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5096     }
5097   else if (EQ (coding_type, Qbig5))
5098     {
5099       coding->detector = detect_coding_big5;
5100       coding->decoder = decode_coding_big5;
5101       coding->encoder = encode_coding_big5;
5102       coding->common_flags
5103         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5104     }
5105   else                          /* EQ (coding_type, Qraw_text) */
5106     {
5107       coding->detector = NULL;
5108       coding->decoder = decode_coding_raw_text;
5109       coding->encoder = encode_coding_raw_text;
5110       if (! EQ (eol_type, Qunix))
5111         {
5112           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5113           if (! VECTORP (eol_type))
5114             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5115         }
5116
5117     }
5118
5119   return;
5120 }
5121
5122 /* Return a list of charsets supported by CODING.  */
5123
5124 Lisp_Object
5125 coding_charset_list (coding)
5126      struct coding_system *coding;
5127 {
5128   Lisp_Object attrs, charset_list;
5129
5130   CODING_GET_INFO (coding, attrs, charset_list);
5131   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5132     {
5133       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5134
5135       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5136         charset_list = Viso_2022_charset_list;
5137     }
5138   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5139     {
5140       charset_list = Vemacs_mule_charset_list;
5141     }
5142   return charset_list;
5143 }
5144
5145
5146 /* Return raw-text or one of its subsidiaries that has the same
5147    eol_type as CODING-SYSTEM.  */
5148
5149 Lisp_Object
5150 raw_text_coding_system (coding_system)
5151      Lisp_Object coding_system;
5152 {
5153   Lisp_Object spec, attrs;
5154   Lisp_Object eol_type, raw_text_eol_type;
5155
5156   if (NILP (coding_system))
5157     return Qraw_text;
5158   spec = CODING_SYSTEM_SPEC (coding_system);
5159   attrs = AREF (spec, 0);
5160
5161   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5162     return coding_system;
5163
5164   eol_type = AREF (spec, 2);
5165   if (VECTORP (eol_type))
5166     return Qraw_text;
5167   spec = CODING_SYSTEM_SPEC (Qraw_text);
5168   raw_text_eol_type = AREF (spec, 2);
5169   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5170           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5171           : AREF (raw_text_eol_type, 2));
5172 }
5173
5174
5175 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5176    does, return one of the subsidiary that has the same eol-spec as
5177    PARENT.  Otherwise, return CODING_SYSTEM.  */
5178
5179 Lisp_Object
5180 coding_inherit_eol_type (coding_system, parent)
5181      Lisp_Object coding_system, parent;
5182 {
5183   Lisp_Object spec, eol_type;
5184
5185   if (NILP (coding_system))
5186     coding_system = Qraw_text;
5187   spec = CODING_SYSTEM_SPEC (coding_system);
5188   eol_type = AREF (spec, 2);
5189   if (VECTORP (eol_type)
5190       && ! NILP (parent))
5191     {
5192       Lisp_Object parent_spec;
5193       Lisp_Object parent_eol_type;
5194
5195       parent_spec
5196         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5197       parent_eol_type = AREF (parent_spec, 2);
5198       if (EQ (parent_eol_type, Qunix))
5199         coding_system = AREF (eol_type, 0);
5200       else if (EQ (parent_eol_type, Qdos))
5201         coding_system = AREF (eol_type, 1);
5202       else if (EQ (parent_eol_type, Qmac))
5203         coding_system = AREF (eol_type, 2);
5204     }
5205   return coding_system;
5206 }
5207
5208 /* Emacs has a mechanism to automatically detect a coding system if it
5209    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5210    it's impossible to distinguish some coding systems accurately
5211    because they use the same range of codes.  So, at first, coding
5212    systems are categorized into the following categories:
5213
5214    o coding-category-emacs-mule
5215
5216         The category for a coding system which has the same code range
5217         as Emacs' internal format.  Assigned the coding-system (Lisp
5218         symbol) `emacs-mule' by default.
5219
5220    o coding-category-sjis
5221
5222         The category for a coding system which has the same code range
5223         as SJIS.  Assigned the coding-system (Lisp
5224         symbol) `japanese-shift-jis' by default.
5225
5226    o coding-category-iso-7
5227
5228         The category for a coding system which has the same code range
5229         as ISO2022 of 7-bit environment.  This doesn't use any locking
5230         shift and single shift functions.  This can encode/decode all
5231         charsets.  Assigned the coding-system (Lisp symbol)
5232         `iso-2022-7bit' by default.
5233
5234    o coding-category-iso-7-tight
5235
5236         Same as coding-category-iso-7 except that this can
5237         encode/decode only the specified charsets.
5238
5239    o coding-category-iso-8-1
5240
5241         The category for a coding system which has the same code range
5242         as ISO2022 of 8-bit environment and graphic plane 1 used only
5243         for DIMENSION1 charset.  This doesn't use any locking shift
5244         and single shift functions.  Assigned the coding-system (Lisp
5245         symbol) `iso-latin-1' by default.
5246
5247    o coding-category-iso-8-2
5248
5249         The category for a coding system which has the same code range
5250         as ISO2022 of 8-bit environment and graphic plane 1 used only
5251         for DIMENSION2 charset.  This doesn't use any locking shift
5252         and single shift functions.  Assigned the coding-system (Lisp
5253         symbol) `japanese-iso-8bit' by default.
5254
5255    o coding-category-iso-7-else
5256
5257         The category for a coding system which has the same code range
5258         as ISO2022 of 7-bit environment but uses locking shift or
5259         single shift functions.  Assigned the coding-system (Lisp
5260         symbol) `iso-2022-7bit-lock' by default.
5261
5262    o coding-category-iso-8-else
5263
5264         The category for a coding system which has the same code range
5265         as ISO2022 of 8-bit environment but uses locking shift or
5266         single shift functions.  Assigned the coding-system (Lisp
5267         symbol) `iso-2022-8bit-ss2' by default.
5268
5269    o coding-category-big5
5270
5271         The category for a coding system which has the same code range
5272         as BIG5.  Assigned the coding-system (Lisp symbol)
5273         `cn-big5' by default.
5274
5275    o coding-category-utf-8
5276
5277         The category for a coding system which has the same code range
5278         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
5279         symbol) `utf-8' by default.
5280
5281    o coding-category-utf-16-be
5282
5283         The category for a coding system in which a text has an
5284         Unicode signature (cf. Unicode Standard) in the order of BIG
5285         endian at the head.  Assigned the coding-system (Lisp symbol)
5286         `utf-16-be' by default.
5287
5288    o coding-category-utf-16-le
5289
5290         The category for a coding system in which a text has an
5291         Unicode signature (cf. Unicode Standard) in the order of
5292         LITTLE endian at the head.  Assigned the coding-system (Lisp
5293         symbol) `utf-16-le' by default.
5294
5295    o coding-category-ccl
5296
5297         The category for a coding system of which encoder/decoder is
5298         written in CCL programs.  The default value is nil, i.e., no
5299         coding system is assigned.
5300
5301    o coding-category-binary
5302
5303         The category for a coding system not categorized in any of the
5304         above.  Assigned the coding-system (Lisp symbol)
5305         `no-conversion' by default.
5306
5307    Each of them is a Lisp symbol and the value is an actual
5308    `coding-system's (this is also a Lisp symbol) assigned by a user.
5309    What Emacs does actually is to detect a category of coding system.
5310    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5311    decide only one possible category, it selects a category of the
5312    highest priority.  Priorities of categories are also specified by a
5313    user in a Lisp variable `coding-category-list'.
5314
5315 */
5316
5317 #define EOL_SEEN_NONE   0
5318 #define EOL_SEEN_LF     1
5319 #define EOL_SEEN_CR     2
5320 #define EOL_SEEN_CRLF   4
5321
5322 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5323    SOURCE is encoded.  If CATEGORY is one of
5324    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5325    two-byte, else they are encoded by one-byte.
5326
5327    Return one of EOL_SEEN_XXX.  */
5328
5329 #define MAX_EOL_CHECK_COUNT 3
5330
5331 static int
5332 detect_eol (source, src_bytes, category)
5333      const unsigned char *source;
5334      EMACS_INT src_bytes;
5335      enum coding_category category;
5336 {
5337   const unsigned char *src = source, *src_end = src + src_bytes;
5338   unsigned char c;
5339   int total  = 0;
5340   int eol_seen = EOL_SEEN_NONE;
5341
5342   if ((1 << category) & CATEGORY_MASK_UTF_16)
5343     {
5344       int msb, lsb;
5345
5346       msb = category == (coding_category_utf_16_le
5347                          | coding_category_utf_16_le_nosig);
5348       lsb = 1 - msb;
5349
5350       while (src + 1 < src_end)
5351         {
5352           c = src[lsb];
5353           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5354             {
5355               int this_eol;
5356
5357               if (c == '\n')
5358                 this_eol = EOL_SEEN_LF;
5359               else if (src + 3 >= src_end
5360                        || src[msb + 2] != 0
5361                        || src[lsb + 2] != '\n')
5362                 this_eol = EOL_SEEN_CR;
5363               else
5364                 this_eol = EOL_SEEN_CRLF;
5365
5366               if (eol_seen == EOL_SEEN_NONE)
5367                 /* This is the first end-of-line.  */
5368                 eol_seen = this_eol;
5369               else if (eol_seen != this_eol)
5370                 {
5371                   /* The found type is different from what found before.  */
5372                   eol_seen = EOL_SEEN_LF;
5373                   break;
5374                 }
5375               if (++total == MAX_EOL_CHECK_COUNT)
5376                 break;
5377             }
5378           src += 2;
5379         }
5380     }
5381   else
5382     {
5383       while (src < src_end)
5384         {
5385           c = *src++;
5386           if (c == '\n' || c == '\r')
5387             {
5388               int this_eol;
5389
5390               if (c == '\n')
5391                 this_eol = EOL_SEEN_LF;
5392               else if (src >= src_end || *src != '\n')
5393                 this_eol = EOL_SEEN_CR;
5394               else
5395                 this_eol = EOL_SEEN_CRLF, src++;
5396
5397               if (eol_seen == EOL_SEEN_NONE)
5398                 /* This is the first end-of-line.  */
5399                 eol_seen = this_eol;
5400               else if (eol_seen != this_eol)
5401                 {
5402                   /* The found type is different from what found before.  */
5403                   eol_seen = EOL_SEEN_LF;
5404                   break;
5405                 }
5406               if (++total == MAX_EOL_CHECK_COUNT)
5407                 break;
5408             }
5409         }
5410     }
5411   return eol_seen;
5412 }
5413
5414
5415 static Lisp_Object
5416 adjust_coding_eol_type (coding, eol_seen)
5417      struct coding_system *coding;
5418      int eol_seen;
5419 {
5420   Lisp_Object eol_type;
5421
5422   eol_type = CODING_ID_EOL_TYPE (coding->id);
5423   if (eol_seen & EOL_SEEN_LF)
5424     {
5425       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5426       eol_type = Qunix;
5427     }
5428   else if (eol_seen & EOL_SEEN_CRLF)
5429     {
5430       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5431       eol_type = Qdos;
5432     }
5433   else if (eol_seen & EOL_SEEN_CR)
5434     {
5435       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5436       eol_type = Qmac;
5437     }
5438   return eol_type;
5439 }
5440
5441 /* Detect how a text specified in CODING is encoded.  If a coding
5442    system is detected, update fields of CODING by the detected coding
5443    system.  */
5444
5445 void
5446 detect_coding (coding)
5447      struct coding_system *coding;
5448 {
5449   const unsigned char *src, *src_end;
5450
5451   coding->consumed = coding->consumed_char = 0;
5452   coding->produced = coding->produced_char = 0;
5453   coding_set_source (coding);
5454
5455   src_end = coding->source + coding->src_bytes;
5456
5457   /* If we have not yet decided the text encoding type, detect it
5458      now.  */
5459   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5460     {
5461       int c, i;
5462       struct coding_detection_info detect_info;
5463
5464       detect_info.checked = detect_info.found = detect_info.rejected = 0;
5465       for (i = 0, src = coding->source; src < src_end; i++, src++)
5466         {
5467           c = *src;
5468           if (c & 0x80)
5469             break;
5470           if (c < 0x20
5471               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5472               && ! inhibit_iso_escape_detection
5473               && ! detect_info.checked)
5474             {
5475               coding->head_ascii = src - (coding->source + coding->consumed);
5476               if (detect_coding_iso_2022 (coding, &detect_info))
5477                 {
5478                   /* We have scanned the whole data.  */
5479                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5480                     /* We didn't find an 8-bit code.  */
5481                     src = src_end;
5482                   break;
5483                 }
5484             }
5485         }
5486       coding->head_ascii = src - (coding->source + coding->consumed);
5487
5488       if (coding->head_ascii < coding->src_bytes
5489           || detect_info.found)
5490         {
5491           enum coding_category category;
5492           struct coding_system *this;
5493
5494           if (coding->head_ascii == coding->src_bytes)
5495             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
5496             for (i = 0; i < coding_category_raw_text; i++)
5497               {
5498                 category = coding_priorities[i];
5499                 this = coding_categories + category;
5500                 if (detect_info.found & (1 << category))
5501                   break;
5502               }
5503           else
5504             for (i = 0; i < coding_category_raw_text; i++)
5505               {
5506                 category = coding_priorities[i];
5507                 this = coding_categories + category;
5508                 if (this->id < 0)
5509                   {
5510                     /* No coding system of this category is defined.  */
5511                     detect_info.rejected |= (1 << category);
5512                   }
5513                 else if (category >= coding_category_raw_text)
5514                   continue;
5515                 else if (detect_info.checked & (1 << category))
5516                   {
5517                     if (detect_info.found & (1 << category))
5518                       break;
5519                   }
5520                 else if ((*(this->detector)) (coding, &detect_info)
5521                          && detect_info.found & (1 << category))
5522                   {
5523                     if (category == coding_category_utf_16_auto)
5524                       {
5525                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5526                           category = coding_category_utf_16_le;
5527                         else
5528                           category = coding_category_utf_16_be;
5529                       }
5530                     break;
5531                   }
5532               }
5533
5534           if (i < coding_category_raw_text)
5535             setup_coding_system (CODING_ID_NAME (this->id), coding);
5536           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5537             setup_coding_system (Qraw_text, coding);
5538           else if (detect_info.rejected)
5539             for (i = 0; i < coding_category_raw_text; i++)
5540               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5541                 {
5542                   this = coding_categories + coding_priorities[i];
5543                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5544                   break;
5545                 }
5546         }
5547     }
5548   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5549            == coding_category_utf_16_auto)
5550     {
5551       Lisp_Object coding_systems;
5552       struct coding_detection_info detect_info;
5553
5554       coding_systems
5555         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5556       detect_info.found = detect_info.rejected = 0;
5557       if (CONSP (coding_systems)
5558           && detect_coding_utf_16 (coding, &detect_info))
5559         {
5560           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5561             setup_coding_system (XCAR (coding_systems), coding);
5562           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5563             setup_coding_system (XCDR (coding_systems), coding);
5564         }
5565     }
5566 }
5567
5568
5569 static void
5570 decode_eol (coding)
5571      struct coding_system *coding;
5572 {
5573   Lisp_Object eol_type;
5574   unsigned char *p, *pbeg, *pend;
5575
5576   eol_type = CODING_ID_EOL_TYPE (coding->id);
5577   if (EQ (eol_type, Qunix))
5578     return;
5579
5580   if (NILP (coding->dst_object))
5581     pbeg = coding->destination;
5582   else
5583     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5584   pend = pbeg + coding->produced;
5585
5586   if (VECTORP (eol_type))
5587     {
5588       int eol_seen = EOL_SEEN_NONE;
5589
5590       for (p = pbeg; p < pend; p++)
5591         {
5592           if (*p == '\n')
5593             eol_seen |= EOL_SEEN_LF;
5594           else if (*p == '\r')
5595             {
5596               if (p + 1 < pend && *(p + 1) == '\n')
5597                 {
5598                   eol_seen |= EOL_SEEN_CRLF;
5599                   p++;
5600                 }
5601               else
5602                 eol_seen |= EOL_SEEN_CR;
5603             }
5604         }
5605       if (eol_seen != EOL_SEEN_NONE
5606           && eol_seen != EOL_SEEN_LF
5607           && eol_seen != EOL_SEEN_CRLF
5608           && eol_seen != EOL_SEEN_CR)
5609         eol_seen = EOL_SEEN_LF;
5610       if (eol_seen != EOL_SEEN_NONE)
5611         eol_type = adjust_coding_eol_type (coding, eol_seen);
5612     }
5613
5614   if (EQ (eol_type, Qmac))
5615     {
5616       for (p = pbeg; p < pend; p++)
5617         if (*p == '\r')
5618           *p = '\n';
5619     }
5620   else if (EQ (eol_type, Qdos))
5621     {
5622       int n = 0;
5623
5624       if (NILP (coding->dst_object))
5625         {
5626           for (p = pend - 2; p >= pbeg; p--)
5627             if (*p == '\r')
5628               {
5629                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5630                 n++;
5631               }
5632         }
5633       else
5634         {
5635           for (p = pend - 2; p >= pbeg; p--)
5636             if (*p == '\r')
5637               {
5638                 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5639                 int pos = BYTE_TO_CHAR (pos_byte);
5640
5641                 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5642                 n++;
5643               }
5644         }
5645       coding->produced -= n;
5646       coding->produced_char -= n;
5647     }
5648 }
5649
5650
5651 /* Return a translation table (or list of them) from coding system
5652    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5653    decoding (ENCODEP is zero). */
5654
5655 static Lisp_Object
5656 get_translation_table (attrs, encodep, max_lookup)
5657      Lisp_Object attrs;
5658      int encodep, *max_lookup;
5659 {
5660   Lisp_Object standard, translation_table;
5661   Lisp_Object val;
5662
5663   if (encodep)
5664     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5665       standard = Vstandard_translation_table_for_encode;
5666   else
5667     translation_table = CODING_ATTR_DECODE_TBL (attrs),
5668       standard = Vstandard_translation_table_for_decode;
5669   if (NILP (translation_table))
5670     translation_table = standard;
5671   else
5672     {
5673       if (SYMBOLP (translation_table))
5674         translation_table = Fget (translation_table, Qtranslation_table);
5675       else if (CONSP (translation_table))
5676         {
5677           translation_table = Fcopy_sequence (translation_table);
5678           for (val = translation_table; CONSP (val); val = XCDR (val))
5679             if (SYMBOLP (XCAR (val)))
5680               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5681         }
5682       if (CHAR_TABLE_P (standard))
5683         {
5684           if (CONSP (translation_table))
5685             translation_table = nconc2 (translation_table,
5686                                         Fcons (standard, Qnil));
5687           else
5688             translation_table = Fcons (translation_table,
5689                                        Fcons (standard, Qnil));
5690         }
5691     }
5692
5693   if (max_lookup)
5694     {
5695       *max_lookup = 1;
5696       if (CHAR_TABLE_P (translation_table)
5697           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5698         {
5699           val = XCHAR_TABLE (translation_table)->extras[1];
5700           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5701             *max_lookup = XFASTINT (val);
5702         }
5703       else if (CONSP (translation_table))
5704         {
5705           Lisp_Object tail, val;
5706
5707           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5708             if (CHAR_TABLE_P (XCAR (tail))
5709                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5710               {
5711                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5712                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5713                   *max_lookup = XFASTINT (val);
5714               }
5715         }
5716     }
5717   return translation_table;
5718 }
5719
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables, and store the outcome in C and TRANS.

   When a table maps C directly to a character, C is replaced by that
   character and TRANS is set to Qnil.  With a list of tables, the
   next table is then consulted with the updated C.  When a table
   yields any other non-nil value, TRANS is set to that value, C is
   left as it is at that point, and (for a list) the search stops.
   If nothing applies, TRANS is Qnil.

   C and TRANS must be lvalues.  TABLE and C are evaluated more than
   once, so they must be free of side effects.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    (trans) = Qnil;                                             \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        (trans) = CHAR_TABLE_REF ((table), (c));                \
        if (CHARACTERP (trans))                                 \
          (c) = XFASTINT (trans), (trans) = Qnil;               \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = (table); CONSP (tail); tail = XCDR (tail))  \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              (trans) = CHAR_TABLE_REF (XCAR (tail), (c));      \
              if (CHARACTERP (trans))                           \
                (c) = XFASTINT (trans), (trans) = Qnil;         \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
5744
5745
5746 static Lisp_Object
5747 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5748      Lisp_Object val;
5749      int *buf, *buf_end;
5750      int last_block;
5751      int *from_nchars, *to_nchars;
5752 {
5753   /* VAL is TO or (([FROM-CHAR ...] .  TO) ...) where TO is TO-CHAR or
5754      [TO-CHAR ...].  */
5755   if (CONSP (val))
5756     {
5757       Lisp_Object from, tail;
5758       int i, len;
5759
5760       for (tail = val; CONSP (tail); tail = XCDR (tail))
5761         {
5762           val = XCAR (tail);
5763           from = XCAR (val);
5764           len = ASIZE (from);
5765           for (i = 0; i < len; i++)
5766             {
5767               if (buf + i == buf_end)
5768                 {
5769                   if (! last_block)
5770                     return Qt;
5771                   break;
5772                 }
5773               if (XINT (AREF (from, i)) != buf[i])
5774                 break;
5775             }
5776           if (i == len)
5777             {
5778               val = XCDR (val);
5779               *from_nchars = len;
5780               break;
5781             }
5782         }
5783       if (! CONSP (tail))
5784         return Qnil;
5785     }
5786   if (VECTORP (val))
5787     *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5788   else
5789     *buf = XINT (val);
5790   return val;
5791 }
5792
5793
5794 static int
5795 produce_chars (coding, translation_table, last_block)
5796      struct coding_system *coding;
5797      Lisp_Object translation_table;
5798      int last_block;
5799 {
5800   unsigned char *dst = coding->destination + coding->produced;
5801   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5802   int produced;
5803   int produced_chars = 0;
5804   int carryover = 0;
5805
5806   if (! coding->chars_at_source)
5807     {
5808       /* Characters are in coding->charbuf.  */
5809       int *buf = coding->charbuf;
5810       int *buf_end = buf + coding->charbuf_used;
5811
5812       if (BUFFERP (coding->src_object)
5813           && EQ (coding->src_object, coding->dst_object))
5814         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5815
5816       while (buf < buf_end)
5817         {
5818           int c = *buf, i;
5819
5820           if (c >= 0)
5821             {
5822               int from_nchars = 1, to_nchars = 1;
5823               Lisp_Object trans = Qnil;
5824
5825               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5826               if (! NILP (trans))
5827                 {
5828                   trans = get_translation (trans, buf, buf_end, last_block,
5829                                            &from_nchars, &to_nchars);
5830                   if (EQ (trans, Qt))
5831                     break;
5832                   c = *buf;
5833                 }
5834
5835               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5836                 {
5837                   dst = alloc_destination (coding,
5838                                            buf_end - buf
5839                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
5840                                            dst);
5841                   dst_end = coding->destination + coding->dst_bytes;
5842                 }
5843
5844               for (i = 0; i < to_nchars; i++)
5845                 {
5846                   if (i > 0)
5847                     c = XINT (AREF (trans, i));
5848                   if (coding->dst_multibyte
5849                       || ! CHAR_BYTE8_P (c))
5850                     CHAR_STRING_ADVANCE (c, dst);
5851                   else
5852                     *dst++ = CHAR_TO_BYTE8 (c);
5853                 }
5854               produced_chars += to_nchars;
5855               *buf++ = to_nchars;
5856               while (--from_nchars > 0)
5857                 *buf++ = 0;
5858             }
5859           else
5860             /* This is an annotation datum.  (-C) is the length.  */
5861             buf += -c;
5862         }
5863       carryover = buf_end - buf;
5864     }
5865   else
5866     {
5867       const unsigned char *src = coding->source;
5868       const unsigned char *src_end = src + coding->src_bytes;
5869       Lisp_Object eol_type;
5870
5871       eol_type = CODING_ID_EOL_TYPE (coding->id);
5872
5873       if (coding->src_multibyte != coding->dst_multibyte)
5874         {
5875           if (coding->src_multibyte)
5876             {
5877               int multibytep = 1;
5878               int consumed_chars;
5879
5880               while (1)
5881                 {
5882                   const unsigned char *src_base = src;
5883                   int c;
5884
5885                   ONE_MORE_BYTE (c);
5886                   if (c == '\r')
5887                     {
5888                       if (EQ (eol_type, Qdos))
5889                         {
5890                           if (src == src_end)
5891                             {
5892                               record_conversion_result
5893                                 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5894                               goto no_more_source;
5895                             }
5896                           if (*src == '\n')
5897                             c = *src++;
5898                         }
5899                       else if (EQ (eol_type, Qmac))
5900                         c = '\n';
5901                     }
5902                   if (dst == dst_end)
5903                     {
5904                       coding->consumed = src - coding->source;
5905
5906                     if (EQ (coding->src_object, coding->dst_object))
5907                       dst_end = (unsigned char *) src;
5908                     if (dst == dst_end)
5909                       {
5910                         dst = alloc_destination (coding, src_end - src + 1,
5911                                                  dst);
5912                         dst_end = coding->destination + coding->dst_bytes;
5913                         coding_set_source (coding);
5914                         src = coding->source + coding->consumed;
5915                         src_end = coding->source + coding->src_bytes;
5916                       }
5917                     }
5918                   *dst++ = c;
5919                   produced_chars++;
5920                 }
5921             no_more_source:
5922               ;
5923             }
5924           else
5925             while (src < src_end)
5926               {
5927                 int multibytep = 1;
5928                 int c = *src++;
5929
5930                 if (c == '\r')
5931                   {
5932                     if (EQ (eol_type, Qdos))
5933                       {
5934                         if (src < src_end
5935                             && *src == '\n')
5936                           c = *src++;
5937                       }
5938                     else if (EQ (eol_type, Qmac))
5939                       c = '\n';
5940                   }
5941                 if (dst >= dst_end - 1)
5942                   {
5943                     coding->consumed = src - coding->source;
5944
5945                     if (EQ (coding->src_object, coding->dst_object))
5946                       dst_end = (unsigned char *) src;
5947                     if (dst >= dst_end - 1)
5948                       {
5949                         dst = alloc_destination (coding, src_end - src + 2,
5950                                                  dst);
5951                         dst_end = coding->destination + coding->dst_bytes;
5952                         coding_set_source (coding);
5953                         src = coding->source + coding->consumed;
5954                         src_end = coding->source + coding->src_bytes;
5955                       }
5956                   }
5957                 EMIT_ONE_BYTE (c);
5958               }
5959         }
5960       else
5961         {
5962           if (!EQ (coding->src_object, coding->dst_object))
5963             {
5964               int require = coding->src_bytes - coding->dst_bytes;
5965
5966               if (require > 0)
5967                 {
5968                   EMACS_INT offset = src - coding->source;
5969
5970                   dst = alloc_destination (coding, require, dst);
5971                   coding_set_source (coding);
5972                   src = coding->source + offset;
5973                   src_end = coding->source + coding->src_bytes;
5974                 }
5975             }
5976           produced_chars = coding->src_chars;
5977           while (src < src_end)
5978             {
5979               int c = *src++;
5980
5981               if (c == '\r')
5982                 {
5983                   if (EQ (eol_type, Qdos))
5984                     {
5985                       if (src < src_end
5986                           && *src == '\n')
5987                         c = *src++;
5988                       produced_chars--;
5989                     }
5990                   else if (EQ (eol_type, Qmac))
5991                     c = '\n';
5992                 }
5993               *dst++ = c;
5994             }
5995         }
5996       coding->consumed = coding->src_bytes;
5997       coding->consumed_char = coding->src_chars;
5998     }
5999
6000   produced = dst - (coding->destination + coding->produced);
6001   if (BUFFERP (coding->dst_object))
6002     insert_from_gap (produced_chars, produced);
6003   coding->produced += produced;
6004   coding->produced_char += produced_chars;
6005   return carryover;
6006 }
6007
6008 /* Compose text in CODING->object according to the annotation data at
6009    CHARBUF.  CHARBUF is an array:
6010      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
6011  */
6012
6013 static INLINE void
6014 produce_composition (coding, charbuf, pos)
6015      struct coding_system *coding;
6016      int *charbuf;
6017      EMACS_INT pos;
6018 {
6019   int len;
6020   EMACS_INT to;
6021   enum composition_method method;
6022   Lisp_Object components;
6023
6024   len = -charbuf[0];
6025   to = pos + charbuf[2];
6026   if (to <= pos)
6027     return;
6028   method = (enum composition_method) (charbuf[3]);
6029
6030   if (method == COMPOSITION_RELATIVE)
6031     components = Qnil;
6032   else if (method >= COMPOSITION_WITH_RULE
6033            && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6034     {
6035       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6036       int i;
6037
6038       len -= 4;
6039       charbuf += 4;
6040       for (i = 0; i < len; i++)
6041         {
6042           args[i] = make_number (charbuf[i]);
6043           if (charbuf[i] < 0)
6044             return;
6045         }
6046       components = (method == COMPOSITION_WITH_ALTCHARS
6047                     ? Fstring (len, args) : Fvector (len, args));
6048     }
6049   else
6050     return;
6051   compose_text (pos, to, components, Qnil, coding->dst_object);
6052 }
6053
6054
6055 /* Put `charset' property on text in CODING->object according to
6056    the annotation data at CHARBUF.  CHARBUF is an array:
6057      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6058  */
6059
6060 static INLINE void
6061 produce_charset (coding, charbuf, pos)
6062      struct coding_system *coding;
6063      int *charbuf;
6064      EMACS_INT pos;
6065 {
6066   EMACS_INT from = pos - charbuf[2];
6067   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6068
6069   Fput_text_property (make_number (from), make_number (pos),
6070                       Qcharset, CHARSET_NAME (charset),
6071                       coding->dst_object);
6072 }
6073
6074
/* Number of elements first requested for CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate the conversion work area CODING->charbuf with alloca and
   record its element count in CODING->charbuf_size.  The request
   starts at CHARBUF_SIZE elements and is halved while alloca yields
   NULL and the size is still above 1024.

   If no area could be obtained, CODING_RESULT_INSUFFICIENT_MEM is
   recorded and the macro RETURNS CODING->result FROM THE ENCLOSING
   FUNCTION, so it may only be used in a function returning int.
   Because the area comes from alloca, it belongs to the stack frame
   of the function that uses this macro.  CODING is evaluated more
   than once.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;                                            \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    while (size > 1024)                                                 \
      {                                                                 \
        (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
        if ((coding)->charbuf)                                          \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = size;                                      \
  } while (0)
6096
6097
6098 static void
6099 produce_annotation (coding, pos)
6100      struct coding_system *coding;
6101      EMACS_INT pos;
6102 {
6103   int *charbuf = coding->charbuf;
6104   int *charbuf_end = charbuf + coding->charbuf_used;
6105
6106   if (NILP (coding->dst_object))
6107     return;
6108
6109   while (charbuf < charbuf_end)
6110     {
6111       if (*charbuf >= 0)
6112         pos += *charbuf++;
6113       else
6114         {
6115           int len = -*charbuf;
6116           switch (charbuf[1])
6117             {
6118             case CODING_ANNOTATE_COMPOSITION_MASK:
6119               produce_composition (coding, charbuf, pos);
6120               break;
6121             case CODING_ANNOTATE_CHARSET_MASK:
6122               produce_charset (coding, charbuf, pos);
6123               break;
6124             default:
6125               abort ();
6126             }
6127           charbuf += len;
6128         }
6129     }
6130 }
6131
/* Decode the data at CODING->src_object into CODING->dst_object.
   CODING->src_object is a buffer, a string, or nil.
   CODING->dst_object is a buffer.

   If CODING->src_object is a buffer, it must be the current buffer.
   In this case, if CODING->src_pos is positive, it is a position of
   the source text in the buffer, otherwise, the source text is in the
   gap area of the buffer, and CODING->src_pos specifies the offset of
   the text from GPT (which must be the same as PT).  If this is the
   same buffer as CODING->dst_object, CODING->src_pos must be
   negative.

   If CODING->src_object is a string, CODING->src_pos is an index to
   that string.

   If CODING->src_object is nil, CODING->source must already point to
   the non-relocatable memory area.  In this case, CODING->src_pos is
   an offset from CODING->source.

   The decoded data is inserted at the current point of the buffer
   CODING->dst_object.

   The work is done in rounds: CODING->decoder fills CODING->charbuf,
   produce_chars writes the characters to the destination, and
   produce_annotation applies any annotation recorded during the
   round.  Source bytes left unconsumed at the end are either emitted
   as raw bytes (when CODING_MODE_LAST_BLOCK is set in CODING->mode)
   or saved in CODING->carryover.  Finally decode_eol is called unless
   the eol-type of CODING is Qunix.

   The value is CODING->result.  */

static int
decode_coding (coding)
     struct coding_system *coding;
{
  Lisp_Object attrs;
  Lisp_Object undo_list;
  Lisp_Object translation_table;
  int carryover;
  int i;

  /* If the source text in the buffer straddles the gap, move the gap
     to the start of the source.  */
  if (BUFFERP (coding->src_object)
      && coding->src_pos > 0
      && coding->src_pos < GPT
      && coding->src_pos + coding->src_chars > GPT)
    move_gap_both (coding->src_pos, coding->src_pos_byte);

  undo_list = Qt;
  if (BUFFERP (coding->dst_object))
    {
      if (current_buffer != XBUFFER (coding->dst_object))
        set_buffer_internal (XBUFFER (coding->dst_object));
      if (GPT != PT)
        move_gap_both (PT, PT_BYTE);
      /* Save the undo list and set it to Qt for the duration of the
         decoding; it is restored below, followed by a single
         record_insert for the whole decoded text.  */
      undo_list = current_buffer->undo_list;
      current_buffer->undo_list = Qt;
    }

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding->chars_at_source = 0;
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->errors = 0;

  /* Note: this macro returns CODING->result from this function if
     the work area cannot be allocated.  */
  ALLOC_CONVERSION_WORK_AREA (coding);

  attrs = CODING_ID_ATTRS (coding->id);
  translation_table = get_translation_table (attrs, 0, NULL);

  carryover = 0;
  do
    {
      /* Position of the first character produced in this round; used
         as the base position for the annotation.  */
      EMACS_INT pos = coding->dst_pos + coding->produced_char;

      coding_set_source (coding);
      coding->annotated = 0;
      /* The characters carried over from the previous round occupy
         the head of the charbuf.  */
      coding->charbuf_used = carryover;
      (*(coding->decoder)) (coding);
      coding_set_destination (coding);
      carryover = produce_chars (coding, translation_table, 0);
      if (coding->annotated)
        produce_annotation (coding, pos);
      /* Move the unprocessed tail of the charbuf to its head for the
         next round.  */
      for (i = 0; i < carryover; i++)
        coding->charbuf[i]
          = coding->charbuf[coding->charbuf_used - carryover + i];
    }
  while (coding->consumed < coding->src_bytes
         && (coding->result == CODING_RESULT_SUCCESS
             || coding->result == CODING_RESULT_INVALID_SRC));

  if (carryover > 0)
    {
      /* Produce the characters still held back, this time with
         LAST_BLOCK nonzero.  */
      coding_set_destination (coding);
      coding->charbuf_used = carryover;
      produce_chars (coding, translation_table, 1);
    }

  coding->carryover_bytes = 0;
  if (coding->consumed < coding->src_bytes)
    {
      int nbytes = coding->src_bytes - coding->consumed;
      const unsigned char *src;

      coding_set_source (coding);
      coding_set_destination (coding);
      src = coding->source + coding->consumed;

      if (coding->mode & CODING_MODE_LAST_BLOCK)
        {
          /* Flush out unprocessed data as binary chars.  We are sure
             that the number of data is less than the size of
             coding->charbuf.  */
          coding->charbuf_used = 0;
          while (nbytes-- > 0)
            {
              int c = *src++;

              if (c & 0x80)
                c = BYTE8_TO_CHAR (c);
              coding->charbuf[coding->charbuf_used++] = c;
            }
          /* No translation table is applied to these raw bytes.  */
          produce_chars (coding, Qnil, 1);
        }
      else
        {
          /* Record unprocessed bytes in coding->carryover.  We are
             sure that the number of data is less than the size of
             coding->carryover.  */
          unsigned char *p = coding->carryover;

          coding->carryover_bytes = nbytes;
          while (nbytes-- > 0)
            *p++ = *src++;
        }
      coding->consumed = coding->src_bytes;
    }

  if (BUFFERP (coding->dst_object))
    {
      current_buffer->undo_list = undo_list;
      record_insert (coding->dst_pos, coding->produced_char);
    }
  if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
    decode_eol (coding);
  return coding->result;
}
6270
6271
6272 /* Extract an annotation datum from a composition starting at POS and
6273    ending before LIMIT of CODING->src_object (buffer or string), store
6274    the data in BUF, set *STOP to a starting position of the next
6275    composition (if any) or to LIMIT, and return the address of the
6276    next element of BUF.
6277
6278    If such an annotation is not found, set *STOP to a starting
6279    position of a composition after POS (if any) or to LIMIT, and
6280    return BUF.  */
6281
6282 static INLINE int *
6283 handle_composition_annotation (pos, limit, coding, buf, stop)
6284      EMACS_INT pos, limit;
6285      struct coding_system *coding;
6286      int *buf;
6287      EMACS_INT *stop;
6288 {
6289   EMACS_INT start, end;
6290   Lisp_Object prop;
6291
6292   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6293       || end > limit)
6294     *stop = limit;
6295   else if (start > pos)
6296     *stop = start;
6297   else
6298     {
6299       if (start == pos)
6300         {
6301           /* We found a composition.  Store the corresponding
6302              annotation data in BUF.  */
6303           int *head = buf;
6304           enum composition_method method = COMPOSITION_METHOD (prop);
6305           int nchars = COMPOSITION_LENGTH (prop);
6306
6307           ADD_COMPOSITION_DATA (buf, nchars, method);
6308           if (method != COMPOSITION_RELATIVE)
6309             {
6310               Lisp_Object components;
6311               int len, i, i_byte;
6312
6313               components = COMPOSITION_COMPONENTS (prop);
6314               if (VECTORP (components))
6315                 {
6316                   len = XVECTOR (components)->size;
6317                   for (i = 0; i < len; i++)
6318                     *buf++ = XINT (AREF (components, i));
6319                 }
6320               else if (STRINGP (components))
6321                 {
6322                   len = SCHARS (components);
6323                   i = i_byte = 0;
6324                   while (i < len)
6325                     {
6326                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6327                       buf++;
6328                     }
6329                 }
6330               else if (INTEGERP (components))
6331                 {
6332                   len = 1;
6333                   *buf++ = XINT (components);
6334                 }
6335               else if (CONSP (components))
6336                 {
6337                   for (len = 0; CONSP (components);
6338                        len++, components = XCDR (components))
6339                     *buf++ = XINT (XCAR (components));
6340                 }
6341               else
6342                 abort ();
6343               *head -= len;
6344             }
6345         }
6346
6347       if (find_composition (end, limit, &start, &end, &prop,
6348                             coding->src_object)
6349           && end <= limit)
6350         *stop = start;
6351       else
6352         *stop = limit;
6353     }
6354   return buf;
6355 }
6356
6357
6358 /* Extract an annotation datum from a text property `charset' at POS of
6359    CODING->src_object (buffer of string), store the data in BUF, set
6360    *STOP to the position where the value of `charset' property changes
6361    (limiting by LIMIT), and return the address of the next element of
6362    BUF.
6363
6364    If the property value is nil, set *STOP to the position where the
6365    property value is non-nil (limiting by LIMIT), and return BUF.  */
6366
6367 static INLINE int *
6368 handle_charset_annotation (pos, limit, coding, buf, stop)
6369      EMACS_INT pos, limit;
6370      struct coding_system *coding;
6371      int *buf;
6372      EMACS_INT *stop;
6373 {
6374   Lisp_Object val, next;
6375   int id;
6376
6377   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6378   if (! NILP (val) && CHARSETP (val))
6379     id = XINT (CHARSET_SYMBOL_ID (val));
6380   else
6381     id = -1;
6382   ADD_CHARSET_DATA (buf, 0, id);
6383   next = Fnext_single_property_change (make_number (pos), Qcharset,
6384                                        coding->src_object,
6385                                        make_number (limit));
6386   *stop = XINT (next);
6387   return buf;
6388 }
6389
6390
6391 static void
6392 consume_chars (coding, translation_table, max_lookup)
6393      struct coding_system *coding;
6394      Lisp_Object translation_table;
6395      int max_lookup;
6396 {
6397   int *buf = coding->charbuf;
6398   int *buf_end = coding->charbuf + coding->charbuf_size;
6399   const unsigned char *src = coding->source + coding->consumed;
6400   const unsigned char *src_end = coding->source + coding->src_bytes;
6401   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6402   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6403   int multibytep = coding->src_multibyte;
6404   Lisp_Object eol_type;
6405   int c;
6406   EMACS_INT stop, stop_composition, stop_charset;
6407   int *lookup_buf = NULL;
6408
6409   if (! NILP (translation_table))
6410     lookup_buf = alloca (sizeof (int) * max_lookup);
6411
6412   eol_type = CODING_ID_EOL_TYPE (coding->id);
6413   if (VECTORP (eol_type))
6414     eol_type = Qunix;
6415
6416   /* Note: composition handling is not yet implemented.  */
6417   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6418
6419   if (NILP (coding->src_object))
6420     stop = stop_composition = stop_charset = end_pos;
6421   else
6422     {
6423       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6424         stop = stop_composition = pos;
6425       else
6426         stop = stop_composition = end_pos;
6427       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6428         stop = stop_charset = pos;
6429       else
6430         stop_charset = end_pos;
6431     }
6432
6433   /* Compensate for CRLF and conversion.  */
6434   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6435   while (buf < buf_end)
6436     {
6437       Lisp_Object trans;
6438
6439       if (pos == stop)
6440         {
6441           if (pos == end_pos)
6442             break;
6443           if (pos == stop_composition)
6444             buf = handle_composition_annotation (pos, end_pos, coding,
6445                                                  buf, &stop_composition);
6446           if (pos == stop_charset)
6447             buf = handle_charset_annotation (pos, end_pos, coding,
6448                                              buf, &stop_charset);
6449           stop = (stop_composition < stop_charset
6450                   ? stop_composition : stop_charset);
6451         }
6452
6453       if (! multibytep)
6454         {
6455           EMACS_INT bytes;
6456
6457           if (coding->encoder == encode_coding_raw_text)
6458             c = *src++, pos++;
6459           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6460             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6461           else
6462             c = BYTE8_TO_CHAR (*src), src++, pos++;
6463         }
6464       else
6465         c = STRING_CHAR_ADVANCE (src), pos++;
6466       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6467         c = '\n';
6468       if (! EQ (eol_type, Qunix))
6469         {
6470           if (c == '\n')
6471             {
6472               if (EQ (eol_type, Qdos))
6473                 *buf++ = '\r';
6474               else
6475                 c = '\r';
6476             }
6477         }
6478
6479       trans = Qnil;
6480       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6481       if (NILP (trans))
6482         *buf++ = c;
6483       else
6484         {
6485           int from_nchars = 1, to_nchars = 1;
6486           int *lookup_buf_end;
6487           const unsigned char *p = src;
6488           int i;
6489
6490           lookup_buf[0] = c;
6491           for (i = 1; i < max_lookup && p < src_end; i++)
6492             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6493           lookup_buf_end = lookup_buf + i;
6494           trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6495                                    &from_nchars, &to_nchars);
6496           if (EQ (trans, Qt)
6497               || buf + to_nchars > buf_end)
6498             break;
6499           *buf++ = *lookup_buf;
6500           for (i = 1; i < to_nchars; i++)
6501             *buf++ = XINT (AREF (trans, i));
6502           for (i = 1; i < from_nchars; i++, pos++)
6503             src += MULTIBYTE_LENGTH_NO_CHECK (src);
6504         }
6505     }
6506
6507   coding->consumed = src - coding->source;
6508   coding->consumed_char = pos - coding->src_pos;
6509   coding->charbuf_used = buf - coding->charbuf;
6510   coding->chars_at_source = 0;
6511 }
6512
6513
6514 /* Encode the text at CODING->src_object into CODING->dst_object.
6515    CODING->src_object is a buffer or a string.
6516    CODING->dst_object is a buffer or nil.
6517
6518    If CODING->src_object is a buffer, it must be the current buffer.
6519    In this case, if CODING->src_pos is positive, it is a position of
6520    the source text in the buffer, otherwise. the source text is in the
6521    gap area of the buffer, and coding->src_pos specifies the offset of
6522    the text from GPT (which must be the same as PT).  If this is the
6523    same buffer as CODING->dst_object, CODING->src_pos must be
6524    negative and CODING should not have `pre-write-conversion'.
6525
6526    If CODING->src_object is a string, CODING should not have
6527    `pre-write-conversion'.
6528
6529    If CODING->dst_object is a buffer, the encoded data is inserted at
6530    the current point of that buffer.
6531
6532    If CODING->dst_object is nil, the encoded data is placed at the
6533    memory area specified by CODING->destination.  */
6534
6535 static int
6536 encode_coding (coding)
6537      struct coding_system *coding;
6538 {
6539   Lisp_Object attrs;
6540   Lisp_Object translation_table;
6541   int max_lookup;
6542
6543   attrs = CODING_ID_ATTRS (coding->id);
6544   if (coding->encoder == encode_coding_raw_text)
6545     translation_table = Qnil, max_lookup = 0;
6546   else
6547     translation_table = get_translation_table (attrs, 1, &max_lookup);
6548
6549   if (BUFFERP (coding->dst_object))
6550     {
6551       set_buffer_internal (XBUFFER (coding->dst_object));
6552       coding->dst_multibyte
6553         = ! NILP (current_buffer->enable_multibyte_characters);
6554     }
6555
6556   coding->consumed = coding->consumed_char = 0;
6557   coding->produced = coding->produced_char = 0;
6558   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6559   coding->errors = 0;
6560
6561   ALLOC_CONVERSION_WORK_AREA (coding);
6562
6563   do {
6564     coding_set_source (coding);
6565     consume_chars (coding, translation_table, max_lookup);
6566     coding_set_destination (coding);
6567     (*(coding->encoder)) (coding);
6568   } while (coding->consumed_char < coding->src_chars);
6569
6570   if (BUFFERP (coding->dst_object))
6571     insert_from_gap (coding->produced_char, coding->produced);
6572
6573   return (coding->result);
6574 }
6575
6576
6577 /* Name (or base name) of work buffer for code conversion.  */
6578 static Lisp_Object Vcode_conversion_workbuf_name;
6579
6580 /* A working buffer used by the top level conversion.  Once it is
6581    created, it is never destroyed.  It has the name
6582    Vcode_conversion_workbuf_name.  The other working buffers are
6583    destroyed after the use is finished, and their names are modified
6584    versions of Vcode_conversion_workbuf_name.  */
6585 static Lisp_Object Vcode_conversion_reused_workbuf;
6586
6587 /* 1 iff Vcode_conversion_reused_workbuf is already in use.  */
6588 static int reused_workbuf_in_use;
6589
6590
6591 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
6592    multibyteness of returning buffer.  */
6593
6594 static Lisp_Object
6595 make_conversion_work_buffer (multibyte)
6596      int multibyte;
6597 {
6598   Lisp_Object name, workbuf;
6599   struct buffer *current;
6600
6601   if (reused_workbuf_in_use++)
6602     {
6603       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6604       workbuf = Fget_buffer_create (name);
6605     }
6606   else
6607     {
6608       name = Vcode_conversion_workbuf_name;
6609       workbuf = Fget_buffer_create (name);
6610       if (NILP (Vcode_conversion_reused_workbuf))
6611         Vcode_conversion_reused_workbuf = workbuf;
6612     }
6613   current = current_buffer;
6614   set_buffer_internal (XBUFFER (workbuf));
6615   Ferase_buffer ();
6616   current_buffer->undo_list = Qt;
6617   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6618   set_buffer_internal (current);
6619   return workbuf;
6620 }
6621
6622
6623 static Lisp_Object
6624 code_conversion_restore (arg)
6625      Lisp_Object arg;
6626 {
6627   Lisp_Object current, workbuf;
6628   struct gcpro gcpro1;
6629
6630   GCPRO1 (arg);
6631   current = XCAR (arg);
6632   workbuf = XCDR (arg);
6633   if (! NILP (workbuf))
6634     {
6635       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6636         reused_workbuf_in_use = 0;
6637       else if (! NILP (Fbuffer_live_p (workbuf)))
6638         Fkill_buffer (workbuf);
6639     }
6640   set_buffer_internal (XBUFFER (current));
6641   UNGCPRO;
6642   return Qnil;
6643 }
6644
6645 Lisp_Object
6646 code_conversion_save (with_work_buf, multibyte)
6647      int with_work_buf, multibyte;
6648 {
6649   Lisp_Object workbuf = Qnil;
6650
6651   if (with_work_buf)
6652     workbuf = make_conversion_work_buffer (multibyte);
6653   record_unwind_protect (code_conversion_restore,
6654                          Fcons (Fcurrent_buffer (), workbuf));
6655   return workbuf;
6656 }
6657
6658 int
6659 decode_coding_gap (coding, chars, bytes)
6660      struct coding_system *coding;
6661      EMACS_INT chars, bytes;
6662 {
6663   int count = specpdl_ptr - specpdl;
6664   Lisp_Object attrs;
6665
6666   code_conversion_save (0, 0);
6667
6668   coding->src_object = Fcurrent_buffer ();
6669   coding->src_chars = chars;
6670   coding->src_bytes = bytes;
6671   coding->src_pos = -chars;
6672   coding->src_pos_byte = -bytes;
6673   coding->src_multibyte = chars < bytes;
6674   coding->dst_object = coding->src_object;
6675   coding->dst_pos = PT;
6676   coding->dst_pos_byte = PT_BYTE;
6677   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6678
6679   if (CODING_REQUIRE_DETECTION (coding))
6680     detect_coding (coding);
6681
6682   coding->mode |= CODING_MODE_LAST_BLOCK;
6683   decode_coding (coding);
6684
6685   attrs = CODING_ID_ATTRS (coding->id);
6686   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6687     {
6688       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6689       Lisp_Object val;
6690
6691       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6692       val = call1 (CODING_ATTR_POST_READ (attrs),
6693                    make_number (coding->produced_char));
6694       CHECK_NATNUM (val);
6695       coding->produced_char += Z - prev_Z;
6696       coding->produced += Z_BYTE - prev_Z_BYTE;
6697     }
6698
6699   unbind_to (count, Qnil);
6700   return coding->result;
6701 }
6702
6703 int
6704 encode_coding_gap (coding, chars, bytes)
6705      struct coding_system *coding;
6706      EMACS_INT chars, bytes;
6707 {
6708   int count = specpdl_ptr - specpdl;
6709
6710   code_conversion_save (0, 0);
6711
6712   coding->src_object = Fcurrent_buffer ();
6713   coding->src_chars = chars;
6714   coding->src_bytes = bytes;
6715   coding->src_pos = -chars;
6716   coding->src_pos_byte = -bytes;
6717   coding->src_multibyte = chars < bytes;
6718   coding->dst_object = coding->src_object;
6719   coding->dst_pos = PT;
6720   coding->dst_pos_byte = PT_BYTE;
6721
6722   encode_coding (coding);
6723
6724   unbind_to (count, Qnil);
6725   return coding->result;
6726 }
6727
6728
6729 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6730    SRC_OBJECT into DST_OBJECT by coding context CODING.
6731
6732    SRC_OBJECT is a buffer, a string, or Qnil.
6733
6734    If it is a buffer, the text is at point of the buffer.  FROM and TO
6735    are positions in the buffer.
6736
6737    If it is a string, the text is at the beginning of the string.
6738    FROM and TO are indices to the string.
6739
6740    If it is nil, the text is at coding->source.  FROM and TO are
6741    indices to coding->source.
6742
6743    DST_OBJECT is a buffer, Qt, or Qnil.
6744
6745    If it is a buffer, the decoded text is inserted at point of the
6746    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6747    is deleted.
6748
6749    If it is Qt, a string is made from the decoded text, and
6750    set in CODING->dst_object.
6751
6752    If it is Qnil, the decoded text is stored at CODING->destination.
6753    The caller must allocate CODING->dst_bytes bytes at
6754    CODING->destination by xmalloc.  If the decoded text is longer than
6755    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6756  */
6757
6758 void
6759 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6760                       dst_object)
6761      struct coding_system *coding;
6762      Lisp_Object src_object;
6763      EMACS_INT from, from_byte, to, to_byte;
6764      Lisp_Object dst_object;
6765 {
6766   int count = specpdl_ptr - specpdl;
6767   unsigned char *destination;
6768   EMACS_INT dst_bytes;
6769   EMACS_INT chars = to - from;
6770   EMACS_INT bytes = to_byte - from_byte;
6771   Lisp_Object attrs;
6772   Lisp_Object buffer;
6773   int saved_pt = -1, saved_pt_byte;
6774
6775   buffer = Fcurrent_buffer ();
6776
6777   if (NILP (dst_object))
6778     {
6779       destination = coding->destination;
6780       dst_bytes = coding->dst_bytes;
6781     }
6782
6783   coding->src_object = src_object;
6784   coding->src_chars = chars;
6785   coding->src_bytes = bytes;
6786   coding->src_multibyte = chars < bytes;
6787
6788   if (STRINGP (src_object))
6789     {
6790       coding->src_pos = from;
6791       coding->src_pos_byte = from_byte;
6792     }
6793   else if (BUFFERP (src_object))
6794     {
6795       set_buffer_internal (XBUFFER (src_object));
6796       if (from != GPT)
6797         move_gap_both (from, from_byte);
6798       if (EQ (src_object, dst_object))
6799         {
6800           saved_pt = PT, saved_pt_byte = PT_BYTE;
6801           TEMP_SET_PT_BOTH (from, from_byte);
6802           del_range_both (from, from_byte, to, to_byte, 1);
6803           coding->src_pos = -chars;
6804           coding->src_pos_byte = -bytes;
6805         }
6806       else
6807         {
6808           coding->src_pos = from;
6809           coding->src_pos_byte = from_byte;
6810         }
6811     }
6812
6813   if (CODING_REQUIRE_DETECTION (coding))
6814     detect_coding (coding);
6815   attrs = CODING_ID_ATTRS (coding->id);
6816
6817   if (EQ (dst_object, Qt)
6818       || (! NILP (CODING_ATTR_POST_READ (attrs))
6819           && NILP (dst_object)))
6820     {
6821       coding->dst_object = code_conversion_save (1, 1);
6822       coding->dst_pos = BEG;
6823       coding->dst_pos_byte = BEG_BYTE;
6824       coding->dst_multibyte = 1;
6825     }
6826   else if (BUFFERP (dst_object))
6827     {
6828       code_conversion_save (0, 0);
6829       coding->dst_object = dst_object;
6830       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6831       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6832       coding->dst_multibyte
6833         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6834     }
6835   else
6836     {
6837       code_conversion_save (0, 0);
6838       coding->dst_object = Qnil;
6839       coding->dst_multibyte = 1;
6840     }
6841
6842   decode_coding (coding);
6843
6844   if (BUFFERP (coding->dst_object))
6845     set_buffer_internal (XBUFFER (coding->dst_object));
6846
6847   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6848     {
6849       struct gcpro gcpro1, gcpro2;
6850       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6851       Lisp_Object val;
6852
6853       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6854       GCPRO2 (coding->src_object, coding->dst_object);
6855       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
6856                         make_number (coding->produced_char));
6857       UNGCPRO;
6858       CHECK_NATNUM (val);
6859       coding->produced_char += Z - prev_Z;
6860       coding->produced += Z_BYTE - prev_Z_BYTE;
6861     }
6862
6863   if (EQ (dst_object, Qt))
6864     {
6865       coding->dst_object = Fbuffer_string ();
6866     }
6867   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6868     {
6869       set_buffer_internal (XBUFFER (coding->dst_object));
6870       if (dst_bytes < coding->produced)
6871         {
6872           destination
6873             = (unsigned char *) xrealloc (destination, coding->produced);
6874           if (! destination)
6875             {
6876               record_conversion_result (coding,
6877                                         CODING_RESULT_INSUFFICIENT_DST);
6878               unbind_to (count, Qnil);
6879               return;
6880             }
6881           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6882             move_gap_both (BEGV, BEGV_BYTE);
6883           bcopy (BEGV_ADDR, destination, coding->produced);
6884           coding->destination = destination;
6885         }
6886     }
6887
6888   if (saved_pt >= 0)
6889     {
6890       /* This is the case of:
6891          (BUFFERP (src_object) && EQ (src_object, dst_object))
6892          As we have moved PT while replacing the original buffer
6893          contents, we must recover it now.  */
6894       set_buffer_internal (XBUFFER (src_object));
6895       if (saved_pt < from)
6896         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6897       else if (saved_pt < from + chars)
6898         TEMP_SET_PT_BOTH (from, from_byte);
6899       else if (! NILP (current_buffer->enable_multibyte_characters))
6900         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6901                           saved_pt_byte + (coding->produced - bytes));
6902       else
6903         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6904                           saved_pt_byte + (coding->produced - bytes));
6905     }
6906
6907   unbind_to (count, coding->dst_object);
6908 }
6909
6910
6911 void
6912 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6913                       dst_object)
6914      struct coding_system *coding;
6915      Lisp_Object src_object;
6916      EMACS_INT from, from_byte, to, to_byte;
6917      Lisp_Object dst_object;
6918 {
6919   int count = specpdl_ptr - specpdl;
6920   EMACS_INT chars = to - from;
6921   EMACS_INT bytes = to_byte - from_byte;
6922   Lisp_Object attrs;
6923   Lisp_Object buffer;
6924   int saved_pt = -1, saved_pt_byte;
6925   int kill_src_buffer = 0;
6926
6927   buffer = Fcurrent_buffer ();
6928
6929   coding->src_object = src_object;
6930   coding->src_chars = chars;
6931   coding->src_bytes = bytes;
6932   coding->src_multibyte = chars < bytes;
6933
6934   attrs = CODING_ID_ATTRS (coding->id);
6935
6936   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6937     {
6938       coding->src_object = code_conversion_save (1, coding->src_multibyte);
6939       set_buffer_internal (XBUFFER (coding->src_object));
6940       if (STRINGP (src_object))
6941         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6942       else if (BUFFERP (src_object))
6943         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6944       else
6945         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6946
6947       if (EQ (src_object, dst_object))
6948         {
6949           set_buffer_internal (XBUFFER (src_object));
6950           saved_pt = PT, saved_pt_byte = PT_BYTE;
6951           del_range_both (from, from_byte, to, to_byte, 1);
6952           set_buffer_internal (XBUFFER (coding->src_object));
6953         }
6954
6955       {
6956         Lisp_Object args[3];
6957
6958         args[0] = CODING_ATTR_PRE_WRITE (attrs);
6959         args[1] = make_number (BEG);
6960         args[2] = make_number (Z);
6961         safe_call (3, args);
6962       }
6963       if (XBUFFER (coding->src_object) != current_buffer)
6964         kill_src_buffer = 1;
6965       coding->src_object = Fcurrent_buffer ();
6966       if (BEG != GPT)
6967         move_gap_both (BEG, BEG_BYTE);
6968       coding->src_chars = Z - BEG;
6969       coding->src_bytes = Z_BYTE - BEG_BYTE;
6970       coding->src_pos = BEG;
6971       coding->src_pos_byte = BEG_BYTE;
6972       coding->src_multibyte = Z < Z_BYTE;
6973     }
6974   else if (STRINGP (src_object))
6975     {
6976       code_conversion_save (0, 0);
6977       coding->src_pos = from;
6978       coding->src_pos_byte = from_byte;
6979     }
6980   else if (BUFFERP (src_object))
6981     {
6982       code_conversion_save (0, 0);
6983       set_buffer_internal (XBUFFER (src_object));
6984       if (EQ (src_object, dst_object))
6985         {
6986           saved_pt = PT, saved_pt_byte = PT_BYTE;
6987           coding->src_object = del_range_1 (from, to, 1, 1);
6988           coding->src_pos = 0;
6989           coding->src_pos_byte = 0;
6990         }
6991       else
6992         {
6993           if (from < GPT && to >= GPT)
6994             move_gap_both (from, from_byte);
6995           coding->src_pos = from;
6996           coding->src_pos_byte = from_byte;
6997         }
6998     }
6999   else
7000     code_conversion_save (0, 0);
7001
7002   if (BUFFERP (dst_object))
7003     {
7004       coding->dst_object = dst_object;
7005       if (EQ (src_object, dst_object))
7006         {
7007           coding->dst_pos = from;
7008           coding->dst_pos_byte = from_byte;
7009         }
7010       else
7011         {
7012           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7013           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7014         }
7015       coding->dst_multibyte
7016         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7017     }
7018   else if (EQ (dst_object, Qt))
7019     {
7020       coding->dst_object = Qnil;
7021       coding->dst_bytes = coding->src_chars;
7022       if (coding->dst_bytes == 0)
7023         coding->dst_bytes = 1;
7024       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7025       coding->dst_multibyte = 0;
7026     }
7027   else
7028     {
7029       coding->dst_object = Qnil;
7030       coding->dst_multibyte = 0;
7031     }
7032
7033   encode_coding (coding);
7034
7035   if (EQ (dst_object, Qt))
7036     {
7037       if (BUFFERP (coding->dst_object))
7038         coding->dst_object = Fbuffer_string ();
7039       else
7040         {
7041           coding->dst_object
7042             = make_unibyte_string ((char *) coding->destination,
7043                                    coding->produced);
7044           xfree (coding->destination);
7045         }
7046     }
7047
7048   if (saved_pt >= 0)
7049     {
7050       /* This is the case of:
7051          (BUFFERP (src_object) && EQ (src_object, dst_object))
7052          As we have moved PT while replacing the original buffer
7053          contents, we must recover it now.  */
7054       set_buffer_internal (XBUFFER (src_object));
7055       if (saved_pt < from)
7056         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7057       else if (saved_pt < from + chars)
7058         TEMP_SET_PT_BOTH (from, from_byte);
7059       else if (! NILP (current_buffer->enable_multibyte_characters))
7060         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7061                           saved_pt_byte + (coding->produced - bytes));
7062       else
7063         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7064                           saved_pt_byte + (coding->produced - bytes));
7065     }
7066
7067   if (kill_src_buffer)
7068     Fkill_buffer (coding->src_object);
7069   unbind_to (count, Qnil);
7070 }
7071
7072
7073 Lisp_Object
7074 preferred_coding_system ()
7075 {
7076   int id = coding_categories[coding_priorities[0]].id;
7077
7078   return CODING_ID_NAME (id);
7079 }
7080
7081 \f
7082 #ifdef emacs
7083 /*** 11. Emacs Lisp library functions ***/
7084
7085 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7086        doc: /* Return t if OBJECT is nil or a coding-system.
7087 See the documentation of `define-coding-system' for information
7088 about coding-system objects.  */)
7089      (obj)
7090      Lisp_Object obj;
7091 {
7092   if (NILP (obj)
7093       || CODING_SYSTEM_ID (obj) >= 0)
7094     return Qt;
7095   if (! SYMBOLP (obj)
7096       || NILP (Fget (obj, Qcoding_system_define_form)))
7097     return Qnil;
7098   return Qt;
7099 }
7100
7101 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7102        Sread_non_nil_coding_system, 1, 1, 0,
7103        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7104      (prompt)
7105      Lisp_Object prompt;
7106 {
7107   Lisp_Object val;
7108   do
7109     {
7110       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7111                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7112     }
7113   while (SCHARS (val) == 0);
7114   return (Fintern (val, Qnil));
7115 }
7116
7117 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7118        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7119 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
7120      (prompt, default_coding_system)
7121      Lisp_Object prompt, default_coding_system;
7122 {
7123   Lisp_Object val;
7124   if (SYMBOLP (default_coding_system))
7125     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
7126   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7127                           Qt, Qnil, Qcoding_system_history,
7128                           default_coding_system, Qnil);
7129   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7130 }
7131
7132 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7133        1, 1, 0,
7134        doc: /* Check validity of CODING-SYSTEM.
7135 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7136 It is valid if it is nil or a symbol defined as a coding system by the
7137 function `define-coding-system'.  */)
7138   (coding_system)
7139      Lisp_Object coding_system;
7140 {
7141   Lisp_Object define_form;
7142
7143   define_form = Fget (coding_system, Qcoding_system_define_form);
7144   if (! NILP (define_form))
7145     {
7146       Fput (coding_system, Qcoding_system_define_form, Qnil);
7147       safe_eval (define_form);
7148     }
7149   if (!NILP (Fcoding_system_p (coding_system)))
7150     return coding_system;
7151   while (1)
7152     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7153 }
7154
7155 \f
7156 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
7157    HIGHEST is nonzero, return the coding system of the highest
7158    priority among the detected coding systems.  Otherwise return a
7159    list of detected coding systems sorted by their priorities.  If
7160    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7161    multibyte form but contains only ASCII and eight-bit chars.
7162    Otherwise, the bytes are raw bytes.
7163
7164    CODING-SYSTEM controls the detection as below:
7165
7166    If it is nil, detect both text-format and eol-format.  If the
7167    text-format part of CODING-SYSTEM is already specified
7168    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
7169    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7170    detect only text-format.  */
7171
7172 Lisp_Object
7173 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7174                       coding_system)
7175      const unsigned char *src;
7176      int src_chars, src_bytes, highest;
7177      int multibytep;
7178      Lisp_Object coding_system;
7179 {
7180   const unsigned char *src_end = src + src_bytes;
7181   Lisp_Object attrs, eol_type;
7182   Lisp_Object val;
7183   struct coding_system coding;
7184   int id;
7185   struct coding_detection_info detect_info;
7186   enum coding_category base_category;
7187
7188   if (NILP (coding_system))
7189     coding_system = Qundecided;
7190   setup_coding_system (coding_system, &coding);
7191   attrs = CODING_ID_ATTRS (coding.id);
7192   eol_type = CODING_ID_EOL_TYPE (coding.id);
7193   coding_system = CODING_ATTR_BASE_NAME (attrs);
7194
7195   coding.source = src;
7196   coding.src_chars = src_chars;
7197   coding.src_bytes = src_bytes;
7198   coding.src_multibyte = multibytep;
7199   coding.consumed = 0;
7200   coding.mode |= CODING_MODE_LAST_BLOCK;
7201
7202   detect_info.checked = detect_info.found = detect_info.rejected = 0;
7203
7204   /* At first, detect text-format if necessary.  */
7205   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7206   if (base_category == coding_category_undecided)
7207     {
7208       enum coding_category category;
7209       struct coding_system *this;
7210       int c, i;
7211
7212       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
7213       for (i = 0; src < src_end; i++, src++)
7214         {
7215           c = *src;
7216           if (c & 0x80)
7217             break;
7218           if (c < 0x20
7219               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7220               && inhibit_iso_escape_detection)
7221             {
7222               coding.head_ascii = src - coding.source;
7223               if (detect_coding_iso_2022 (&coding, &detect_info))
7224                 {
7225                   /* We have scanned the whole data.  */
7226                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7227                     /* We didn't find an 8-bit code.  */
7228                     src = src_end;
7229                   break;
7230                 }
7231             }
7232         }
7233       coding.head_ascii = src - coding.source;
7234
7235       if (src < src_end
7236           || detect_info.found)
7237         {
7238           if (src == src_end)
7239             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
7240             for (i = 0; i < coding_category_raw_text; i++)
7241               {
7242                 category = coding_priorities[i];
7243                 if (detect_info.found & (1 << category))
7244                   break;
7245               }
7246           else
7247             for (i = 0; i < coding_category_raw_text; i++)
7248               {
7249                 category = coding_priorities[i];
7250                 this = coding_categories + category;
7251
7252                 if (this->id < 0)
7253                   {
7254                     /* No coding system of this category is defined.  */
7255                     detect_info.rejected |= (1 << category);
7256                   }
7257                 else if (category >= coding_category_raw_text)
7258                   continue;
7259                 else if (detect_info.checked & (1 << category))
7260                   {
7261                     if (highest
7262                         && (detect_info.found & (1 << category)))
7263                       break;
7264                   }
7265                 else
7266                   {
7267                     if ((*(this->detector)) (&coding, &detect_info)
7268                         && highest
7269                         && (detect_info.found & (1 << category)))
7270                       {
7271                         if (category == coding_category_utf_16_auto)
7272                           {
7273                             if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7274                               category = coding_category_utf_16_le;
7275                             else
7276                               category = coding_category_utf_16_be;
7277                           }
7278                         break;
7279                       }
7280                   }
7281               }
7282         }
7283
7284       if (detect_info.rejected == CATEGORY_MASK_ANY)
7285         {
7286           detect_info.found = CATEGORY_MASK_RAW_TEXT;
7287           id = coding_categories[coding_category_raw_text].id;
7288           val = Fcons (make_number (id), Qnil);
7289         }
7290       else if (! detect_info.rejected && ! detect_info.found)
7291         {
7292           detect_info.found = CATEGORY_MASK_ANY;
7293           id = coding_categories[coding_category_undecided].id;
7294           val = Fcons (make_number (id), Qnil);
7295         }
7296       else if (highest)
7297         {
7298           if (detect_info.found)
7299             {
7300               detect_info.found = 1 << category;
7301               val = Fcons (make_number (this->id), Qnil);
7302             }
7303           else
7304             for (i = 0; i < coding_category_raw_text; i++)
7305               if (! (detect_info.rejected & (1 << coding_priorities[i])))
7306                 {
7307                   detect_info.found = 1 << coding_priorities[i];
7308                   id = coding_categories[coding_priorities[i]].id;
7309                   val = Fcons (make_number (id), Qnil);
7310                   break;
7311                 }
7312         }
7313       else
7314         {
7315           int mask = detect_info.rejected | detect_info.found;
7316           int found = 0;
7317           val = Qnil;
7318
7319           for (i = coding_category_raw_text - 1; i >= 0; i--)
7320             {
7321               category = coding_priorities[i];
7322               if (! (mask & (1 << category)))
7323                 {
7324                   found |= 1 << category;
7325                   id = coding_categories[category].id;
7326                   val = Fcons (make_number (id), val);
7327                 }
7328             }
7329           for (i = coding_category_raw_text - 1; i >= 0; i--)
7330             {
7331               category = coding_priorities[i];
7332               if (detect_info.found & (1 << category))
7333                 {
7334                   id = coding_categories[category].id;
7335                   val = Fcons (make_number (id), val);
7336                 }
7337             }
7338           detect_info.found |= found;
7339         }
7340     }
7341   else if (base_category == coding_category_utf_16_auto)
7342     {
7343       if (detect_coding_utf_16 (&coding, &detect_info))
7344         {
7345           struct coding_system *this;
7346
7347           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7348             this = coding_categories + coding_category_utf_16_le;
7349           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7350             this = coding_categories + coding_category_utf_16_be;
7351           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7352             this = coding_categories + coding_category_utf_16_be_nosig;
7353           else
7354             this = coding_categories + coding_category_utf_16_le_nosig;
7355           val = Fcons (make_number (this->id), Qnil);
7356         }
7357     }
7358   else
7359     {
7360       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7361       val = Fcons (make_number (coding.id), Qnil);
7362     }
7363
7364   /* Then, detect eol-format if necessary.  */
7365   {
7366     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7367     Lisp_Object tail;
7368
7369     if (VECTORP (eol_type))
7370       {
7371         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7372           normal_eol = detect_eol (coding.source, src_bytes,
7373                                    coding_category_raw_text);
7374         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7375                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7376           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7377                                       coding_category_utf_16_be);
7378         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7379                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7380           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7381                                       coding_category_utf_16_le);
7382       }
7383     else
7384       {
7385         if (EQ (eol_type, Qunix))
7386           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7387         else if (EQ (eol_type, Qdos))
7388           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7389         else
7390           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7391       }
7392
7393     for (tail = val; CONSP (tail); tail = XCDR (tail))
7394       {
7395         enum coding_category category;
7396         int this_eol;
7397
7398         id = XINT (XCAR (tail));
7399         attrs = CODING_ID_ATTRS (id);
7400         category = XINT (CODING_ATTR_CATEGORY (attrs));
7401         eol_type = CODING_ID_EOL_TYPE (id);
7402         if (VECTORP (eol_type))
7403           {
7404             if (category == coding_category_utf_16_be
7405                 || category == coding_category_utf_16_be_nosig)
7406               this_eol = utf_16_be_eol;
7407             else if (category == coding_category_utf_16_le
7408                      || category == coding_category_utf_16_le_nosig)
7409               this_eol = utf_16_le_eol;
7410             else
7411               this_eol = normal_eol;
7412
7413             if (this_eol == EOL_SEEN_LF)
7414               XSETCAR (tail, AREF (eol_type, 0));
7415             else if (this_eol == EOL_SEEN_CRLF)
7416               XSETCAR (tail, AREF (eol_type, 1));
7417             else if (this_eol == EOL_SEEN_CR)
7418               XSETCAR (tail, AREF (eol_type, 2));
7419             else
7420               XSETCAR (tail, CODING_ID_NAME (id));
7421           }
7422         else
7423           XSETCAR (tail, CODING_ID_NAME (id));
7424       }
7425   }
7426
7427   return (highest ? XCAR (val) : val);
7428 }
7429
7430
7431 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7432        2, 3, 0,
7433        doc: /* Detect coding system of the text in the region between START and END.
7434 Return a list of possible coding systems ordered by priority.
7435
7436 If only ASCII characters are found, it returns a list of a single element,
7437 `undecided' or its subsidiary coding system according to a detected
7438 end-of-line format.
7439
7440 If optional argument HIGHEST is non-nil, return the coding system of
7441 highest priority.  */)
7442      (start, end, highest)
7443      Lisp_Object start, end, highest;
7444 {
7445   int from, to;
7446   int from_byte, to_byte;
7447
7448   CHECK_NUMBER_COERCE_MARKER (start);
7449   CHECK_NUMBER_COERCE_MARKER (end);
7450
7451   validate_region (&start, &end);
7452   from = XINT (start), to = XINT (end);
7453   from_byte = CHAR_TO_BYTE (from);
7454   to_byte = CHAR_TO_BYTE (to);
7455
       /* detect_coding_system reads one contiguous span of bytes, so when
          the region straddles the gap, move the gap to the region's end
          before taking the address of its first byte.  */
7456   if (from < GPT && to >= GPT)
7457     move_gap_both (to, to_byte);
7458
       /* A nil CODING-SYSTEM asks for detection of both the text-format
          and the eol-format.  */
7459   return detect_coding_system (BYTE_POS_ADDR (from_byte),
7460                                to - from, to_byte - from_byte,
7461                                !NILP (highest),
7462                                !NILP (current_buffer
7463                                       ->enable_multibyte_characters),
7464                                Qnil);
7465 }
7466
7467 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7468        1, 2, 0,
7469        doc: /* Detect coding system of the text in STRING.
7470 Return a list of possible coding systems ordered by priority.
7471
7472 If only ASCII characters are found, it returns a list of a single element,
7473 `undecided' or its subsidiary coding system according to a detected
7474 end-of-line format.
7475
7476 If optional argument HIGHEST is non-nil, return the coding system of
7477 highest priority.  */)
7478      (string, highest)
7479      Lisp_Object string, highest;
7480 {
7481   CHECK_STRING (string);
7482
       /* Hand the string's bytes to the detector together with its
          character count, byte count and multibyteness.  A nil
          CODING-SYSTEM asks for detection of both the text-format and the
          eol-format.  */
7483   return detect_coding_system (SDATA (string),
7484                                SCHARS (string), SBYTES (string),
7485                                !NILP (highest), STRING_MULTIBYTE (string),
7486                                Qnil);
7487 }
7488
7489
     /* Return nonzero if character C belongs to one of the charsets in
        the charset list of ATTRS, the attribute vector of a coding
        system.  If ATTRS holds a non-nil translation table, C is
        translated through it before the charsets are tried.  */
7490 static INLINE int
7491 char_encodable_p (c, attrs)
7492      int c;
7493      Lisp_Object attrs;
7494 {
7495   Lisp_Object tail;
7496   struct charset *charset;
7497   Lisp_Object translation_table;
7498
7499   translation_table = CODING_ATTR_TRANS_TBL (attrs);
7500   if (! NILP (translation_table))
7501     c = translate_char (translation_table, c);
       /* Stop at the first charset that contains C; TAIL is then still a
          cons.  If no charset matches, TAIL runs off the end of the
          list.  */
7502   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7503        CONSP (tail); tail = XCDR (tail))
7504     {
7505       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7506       if (CHAR_CHARSET_P (c, charset))
7507         break;
7508     }
7509   return (! NILP (tail));
7510 }
7511
7512
7513 /* Return a list of coding systems that safely encode the text between
7514    START and END.  If EXCLUDE is non-nil, it is a list of coding
7515    systems not to check.  The returned list doesn't contain any such
7516    coding systems.  In any case, if the text contains only ASCII or is
7517    unibyte, return t.  START may also be a string; then the whole
        string is checked and END is not used.  */
7518
7519 DEFUN ("find-coding-systems-region-internal",
7520        Ffind_coding_systems_region_internal,
7521        Sfind_coding_systems_region_internal, 2, 3, 0,
7522        doc: /* Internal use only.  */)
7523      (start, end, exclude)
7524      Lisp_Object start, end, exclude;
7525 {
7526   Lisp_Object coding_attrs_list, safe_codings;
7527   EMACS_INT start_byte, end_byte;
7528   const unsigned char *p, *pbeg, *pend;
7529   int c;
7530   Lisp_Object tail, elt;
7531
7532   if (STRINGP (start))
7533     {
           /* A unibyte string, or a multibyte one with as many bytes as
              characters, needs no checking.  */
7534       if (!STRING_MULTIBYTE (start)
7535           || SCHARS (start) == SBYTES (start))
7536         return Qt;
7537       start_byte = 0;
7538       end_byte = SBYTES (start);
7539     }
7540   else
7541     {
7542       CHECK_NUMBER_COERCE_MARKER (start);
7543       CHECK_NUMBER_COERCE_MARKER (end);
7544       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7545         args_out_of_range (start, end);
7546       if (NILP (current_buffer->enable_multibyte_characters))
7547         return Qt;
7548       start_byte = CHAR_TO_BYTE (XINT (start));
7549       end_byte = CHAR_TO_BYTE (XINT (end));
           /* As many bytes as characters: nothing to check.  */
7550       if (XINT (end) - XINT (start) == end_byte - start_byte)
7551         return Qt;
7552
           /* The scan below walks one contiguous span, so if the gap lies
              inside the region, move it to whichever end of the region is
              nearer.  */
7553       if (XINT (start) < GPT && XINT (end) > GPT)
7554         {
7555           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7556             move_gap_both (XINT (start), start_byte);
7557           else
7558             move_gap_both (XINT (end), end_byte);
7559         }
7560     }
7561
       /* Collect the attribute vector of every coding system that is not
          in EXCLUDE, whose name is its own base name, and whose type is
          not `undecided'.  The translation table obtained from
          get_translation_table is stored into each collected vector,
          where char_encodable_p picks it up.  */
7562   coding_attrs_list = Qnil;
7563   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7564     if (NILP (exclude)
7565         || NILP (Fmemq (XCAR (tail), exclude)))
7566       {
7567         Lisp_Object attrs;
7568
7569         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7570         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7571             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7572           {
7573             ASET (attrs, coding_attr_trans_tbl,
7574                   get_translation_table (attrs, 1, NULL));
7575             coding_attrs_list = Fcons (attrs, coding_attrs_list);
7576           }
7577       }
7578
7579   if (STRINGP (start))
7580     p = pbeg = SDATA (start);
7581   else
7582     p = pbeg = BYTE_POS_ADDR (start_byte);
7583   pend = p + (end_byte - start_byte);
7584
       /* Trim the ASCII bytes at both ends of the span.  */
7585   while (p < pend && ASCII_BYTE_P (*p)) p++;
7586   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7587
7588   while (p < pend)
7589     {
7590       if (ASCII_BYTE_P (*p))
7591         p++;
7592       else
7593         {
7594           c = STRING_CHAR_ADVANCE (p);
7595
               /* Remove from the list, in place, every candidate that
                  cannot encode C.  */
7596           charset_map_loaded = 0;
7597           for (tail = coding_attrs_list; CONSP (tail);)
7598             {
7599               elt = XCAR (tail);
7600               if (NILP (elt))
7601                 tail = XCDR (tail);
7602               else if (char_encodable_p (c, elt))
7603                 tail = XCDR (tail);
7604               else if (CONSP (XCDR (tail)))
7605                 {
                       /* Overwrite this cell with the next one.  TAIL is
                          not advanced, so the copied element is examined
                          on the next iteration.  */
7606                   XSETCAR (tail, XCAR (XCDR (tail)));
7607                   XSETCDR (tail, XCDR (XCDR (tail)));
7608                 }
7609               else
7610                 {
                       /* Last cell: there is nothing to copy over it, so
                          mark it as removed with nil.  */
7611                   XSETCAR (tail, Qnil);
7612                   tail = XCDR (tail);
7613                 }
7614             }
               /* The flag was cleared before the loop, so it is set only
                  if something changed it meanwhile.  Recompute the
                  pointers from their offsets in that case.
                  NOTE(review): presumably loading a charset map can
                  relocate the text -- confirm.  */
7615           if (charset_map_loaded)
7616             {
7617               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7618
7619               if (STRINGP (start))
7620                 pbeg = SDATA (start);
7621               else
7622                 pbeg = BYTE_POS_ADDR (start_byte);
7623               p = pbeg + p_offset;
7624               pend = pbeg + pend_offset;
7625             }
7626         }
7627     }
7628
       /* raw-text and no-conversion are always part of the result; the
          base names of the surviving candidates go in front of them.  */
7629   safe_codings = list2 (Qraw_text, Qno_conversion);
7630   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7631     if (! NILP (XCAR (tail)))
7632       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7633
7634   return safe_codings;
7635 }
7636
7637
7638 DEFUN ("unencodable-char-position", Funencodable_char_position,
7639        Sunencodable_char_position, 3, 5, 0,
7640        doc: /*
7641 Return position of first un-encodable character in a region.
7642 START and END specify the region and CODING-SYSTEM specifies the
7643 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7644
7645 If optional 4th argument COUNT is non-nil, it specifies at most how
7646 many un-encodable characters to search.  In this case, the value is a
7647 list of positions.
7648
7649 If optional 5th argument STRING is non-nil, it is a string to search
7650 for un-encodable characters.  In that case, START and END are indexes
7651 to the string.  */)
7652      (start, end, coding_system, count, string)
7653      Lisp_Object start, end, coding_system, count, string;
7654 {
7655   int n;
7656   struct coding_system coding;
7657   Lisp_Object attrs, charset_list, translation_table;
7658   Lisp_Object positions;
7659   int from, to;
7660   const unsigned char *p, *stop, *pend;
7661   int ascii_compatible;
7662
7663   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7664   attrs = CODING_ID_ATTRS (coding.id);
       /* A coding system of type raw-text is never reported as failing.  */
7665   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7666     return Qnil;
7667   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7668   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7669   translation_table = get_translation_table (attrs, 1, NULL);
7670
7671   if (NILP (string))
7672     {
7673       validate_region (&start, &end);
7674       from = XINT (start);
7675       to = XINT (end);
           /* Nothing to report for a unibyte buffer, or when the coding
              system is ASCII compatible and the region has as many bytes
              as characters.  */
7676       if (NILP (current_buffer->enable_multibyte_characters)
7677           || (ascii_compatible
7678               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7679         return Qnil;
7680       p = CHAR_POS_ADDR (from);
7681       pend = CHAR_POS_ADDR (to);
           /* If the gap lies inside the region, scan up to the gap first;
              the loop below resumes after the gap.  */
7682       if (from < GPT && to >= GPT)
7683         stop = GPT_ADDR;
7684       else
7685         stop = pend;
7686     }
7687   else
7688     {
7689       CHECK_STRING (string);
7690       CHECK_NATNUM (start);
7691       CHECK_NATNUM (end);
7692       from = XINT (start);
7693       to = XINT (end);
7694       if (from > to
7695           || to > SCHARS (string))
7696         args_out_of_range_3 (string, start, end);
7697       if (! STRING_MULTIBYTE (string))
7698         return Qnil;
7699       p = SDATA (string) + string_char_to_byte (string, from);
7700       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7701       if (ascii_compatible && (to - from) == (pend - p))
7702         return Qnil;
7703     }
7704
       /* Without COUNT, look for a single position only.  */
7705   if (NILP (count))
7706     n = 1;
7707   else
7708     {
7709       CHECK_NATNUM (count);
7710       n = XINT (count);
7711     }
7712
       /* FROM is advanced in step with P, so it is always the character
          position (or string index) of the character at P.  */
7713   positions = Qnil;
7714   while (1)
7715     {
7716       int c;
7717
7718       if (ascii_compatible)
7719         while (p < stop && ASCII_BYTE_P (*p))
7720           p++, from++;
7721       if (p >= stop)
7722         {
7723           if (p >= pend)
7724             break;
               /* Reached the gap: continue with the text after it.  */
7725           stop = pend;
7726           p = GAP_END_ADDR;
7727         }
7728
7729       c = STRING_CHAR_ADVANCE (p);
           /* Record FROM unless C is ASCII under an ASCII-compatible
              coding system, or char_charset finds a charset of
              CHARSET_LIST for the translated C.  */
7730       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7731           && ! char_charset (translate_char (translation_table, c),
7732                              charset_list, NULL))
7733         {
7734           positions = Fcons (make_number (from), positions);
7735           n--;
7736           if (n == 0)
7737             break;
7738         }
7739
7740       from++;
7741     }
7742
       /* POSITIONS was built by pushing, so it is reversed to yield
          increasing positions.  Without COUNT it holds at most one
          element, which is returned by itself (nil if there is none).  */
7743   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7744 }
7745
7746
7747 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7748        Scheck_coding_systems_region, 3, 3, 0,
7749        doc: /* Check if the region is encodable by coding systems.
7750
7751 START and END are buffer positions specifying the region.
7752 CODING-SYSTEM-LIST is a list of coding systems to check.
7753
7754 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7755 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
7756 whole region, POS0, POS1, ... are buffer positions where non-encodable
7757 characters are found.
7758
7759 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7760 value is nil.
7761
7762 START may be a string.  In that case, check if the string is
7763 encodable, and the value contains indices to the string instead of
7764 buffer positions.  END is ignored.  */)
7765      (start, end, coding_system_list)
7766      Lisp_Object start, end, coding_system_list;
7767 {
7768   Lisp_Object list;
7769   EMACS_INT start_byte, end_byte;
7770   int pos;
7771   const unsigned char *p, *pbeg, *pend;
7772   int c;
7773   Lisp_Object tail, elt, attrs;
7774
7775   if (STRINGP (start))
7776     {
           /* A unibyte string, or a multibyte one with as many bytes as
              characters, has nothing to report.  This is the same test
              as in find-coding-systems-region-internal.  */
7777       if (!STRING_MULTIBYTE (start)
7778           || SCHARS (start) == SBYTES (start))
7779         return Qnil;
7780       start_byte = 0;
7781       end_byte = SBYTES (start);
7782       pos = 0;
7783     }
7784   else
7785     {
7786       CHECK_NUMBER_COERCE_MARKER (start);
7787       CHECK_NUMBER_COERCE_MARKER (end);
7788       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7789         args_out_of_range (start, end);
7790       if (NILP (current_buffer->enable_multibyte_characters))
7791         return Qnil;
7792       start_byte = CHAR_TO_BYTE (XINT (start));
7793       end_byte = CHAR_TO_BYTE (XINT (end));
           /* As many bytes as characters: every coding system passes, and
              the documented value for that case is nil.  */
7794       if (XINT (end) - XINT (start) == end_byte - start_byte)
7795         return Qnil;
7796
           /* The scan below walks one contiguous span, so if the gap lies
              inside the region, move it to whichever end of the region is
              nearer.  */
7797       if (XINT (start) < GPT && XINT (end) > GPT)
7798         {
7799           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7800             move_gap_both (XINT (start), start_byte);
7801           else
7802             move_gap_both (XINT (end), end_byte);
7803         }
7804       pos = XINT (start);
7805     }
7806
       /* Build a work list of (CODING-SYSTEM ATTRS) entries.  Positions of
          unencodable characters are later pushed after ATTRS.  The
          translation table obtained from get_translation_table is stored
          into ATTRS, where char_encodable_p picks it up.  */
7807   list = Qnil;
7808   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7809     {
7810       elt = XCAR (tail);
7811       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7812       ASET (attrs, coding_attr_trans_tbl,
7813             get_translation_table (attrs, 1, NULL));
7814       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7815     }
7816
7817   if (STRINGP (start))
7818     p = pbeg = SDATA (start);
7819   else
7820     p = pbeg = BYTE_POS_ADDR (start_byte);
7821   pend = p + (end_byte - start_byte);
7822
       /* Trim the ASCII bytes at both ends, keeping POS in step with P.  */
7823   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7824   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7825
7826   while (p < pend)
7827     {
7828       if (ASCII_BYTE_P (*p))
7829         p++;
7830       else
7831         {
7832           c = STRING_CHAR_ADVANCE (p);
7833
               /* Record POS in the entry of every coding system that
                  cannot encode C.  */
7834           charset_map_loaded = 0;
7835           for (tail = list; CONSP (tail); tail = XCDR (tail))
7836             {
7837               elt = XCDR (XCAR (tail));
7838               if (! char_encodable_p (c, XCAR (elt)))
7839                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7840             }
               /* The flag was cleared before the loop, so it is set only
                  if something changed it meanwhile.  Recompute the
                  pointers from their offsets in that case.
                  NOTE(review): presumably loading a charset map can
                  relocate the text -- confirm.  */
7841           if (charset_map_loaded)
7842             {
7843               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7844
7845               if (STRINGP (start))
7846                 pbeg = SDATA (start);
7847               else
7848                 pbeg = BYTE_POS_ADDR (start_byte);
7849               p = pbeg + p_offset;
7850               pend = pbeg + pend_offset;
7851             }
7852         }
7853       pos++;
7854     }
7855
       /* Keep only the entries that recorded at least one position, drop
          ATTRS from them, and put the positions in increasing order.  */
7856   tail = list;
7857   list = Qnil;
7858   for (; CONSP (tail); tail = XCDR (tail))
7859     {
7860       elt = XCAR (tail);
7861       if (CONSP (XCDR (XCDR (elt))))
7862         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7863                       list);
7864     }
7865
7866   return list;
7867 }
7868
7869
     /* Encode (if ENCODEP is nonzero) or decode the text between START and
        END of the current buffer according to CODING_SYSTEM; nil means
        no-conversion.  DST_OBJECT is nil (use the current buffer), t, or
        a buffer.  Unless NORECORD is nonzero, set
        Vlast_coding_system_used to the name of the coding system used.

        If DST_OBJECT ends up being a buffer, return the number of
        characters produced; otherwise return the destination object left
        in the conversion state.  */
7870 Lisp_Object
7871 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7872      Lisp_Object start, end, coding_system, dst_object;
7873      int encodep, norecord;
7874 {
7875   struct coding_system coding;
7876   EMACS_INT from, from_byte, to, to_byte;
7877   Lisp_Object src_object;
7878
7879   CHECK_NUMBER_COERCE_MARKER (start);
7880   CHECK_NUMBER_COERCE_MARKER (end);
7881   if (NILP (coding_system))
7882     coding_system = Qno_conversion;
7883   else
7884     CHECK_CODING_SYSTEM (coding_system);
       /* The source is always the current buffer; a nil destination means
          the same buffer.  */
7885   src_object = Fcurrent_buffer ();
7886   if (NILP (dst_object))
7887     dst_object = src_object;
7888   else if (! EQ (dst_object, Qt))
7889     CHECK_BUFFER (dst_object);
7890
7891   validate_region (&start, &end);
7892   from = XFASTINT (start);
7893   from_byte = CHAR_TO_BYTE (from);
7894   to = XFASTINT (end);
7895   to_byte = CHAR_TO_BYTE (to);
7896
       /* The region is converted as a single, final block.  */
7897   setup_coding_system (coding_system, &coding);
7898   coding.mode |= CODING_MODE_LAST_BLOCK;
7899
7900   if (encodep)
7901     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7902                           dst_object);
7903   else
7904     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7905                           dst_object);
7906   if (! norecord)
7907     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7908
7909   return (BUFFERP (dst_object)
7910           ? make_number (coding.produced_char)
7911           : coding.dst_object);
7912 }
7913
7914
7915 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7916        3, 4, "r\nzCoding system: ",
7917        doc: /* Decode the current region from the specified coding system.
7918 When called from a program, takes four arguments:
7919         START, END, CODING-SYSTEM, and DESTINATION.
7920 START and END are buffer positions.
7921
7922 Optional 4th argument DESTINATION specifies where the decoded text goes.
7923 If nil, the region between START and END is replaced by the decoded text.
7924 If buffer, the decoded text is inserted in the buffer.
7925 If t, the decoded text is returned.
7926
7927 This function sets `last-coding-system-used' to the precise coding system
7928 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7929 not fully specified.)
7930 Unless DESTINATION is t, it returns the length of the decoded text.  */)
7931      (start, end, coding_system, destination)
7932      Lisp_Object start, end, coding_system, destination;
7933 {
       /* ENCODEP is 0 (decode) and NORECORD is 0 (do record the coding
          system used).  */
7934   return code_convert_region (start, end, coding_system, destination, 0, 0);
7935 }
7936
7937 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7938        3, 4, "r\nzCoding system: ",
7939        doc: /* Encode the current region by specified coding system.
7940 When called from a program, takes four arguments:
7941 START, END, CODING-SYSTEM, and DESTINATION.  START and END are buffer positions.
7942
7943 Optional 4th argument DESTINATION specifies where the encoded text goes.
7944 If nil, the region between START and END is replaced by the encoded text.
7945 If buffer, the encoded text is inserted in the buffer.
7946 If t, the encoded text is returned.
7947
7948 This function sets `last-coding-system-used' to the precise coding system
7949 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7950 not fully specified.)
7951 Unless DESTINATION is t, it returns the length of the encoded text.  */)
7952   (start, end, coding_system, destination)
7953      Lisp_Object start, end, coding_system, destination;
7954 {
       /* ENCODEP is 1 (encode) and NORECORD is 0 (do record the coding
          system used).  */
7955   return code_convert_region (start, end, coding_system, destination, 1, 0);
7956 }
7957
     /* Encode (if ENCODEP is nonzero) or decode STRING according to
        CODING_SYSTEM; nil means no-conversion.  DST_OBJECT is nil or t
        (return the result as a Lisp object), or a buffer.  Unless NORECORD
        is nonzero, set Vlast_coding_system_used to the name of the coding
        system used.

        When CODING_SYSTEM and DST_OBJECT are both nil the operation is
        trivial: STRING itself is returned if NOCOPY is nonzero, otherwise
        a copy of it.

        If DST_OBJECT is a buffer, return the number of characters
        produced; otherwise return the destination object left in the
        conversion state.  */
7958 Lisp_Object
7959 code_convert_string (string, coding_system, dst_object,
7960                      encodep, nocopy, norecord)
7961      Lisp_Object string, coding_system, dst_object;
7962      int encodep, nocopy, norecord;
7963 {
7964   struct coding_system coding;
7965   EMACS_INT chars, bytes;
7966
7967   CHECK_STRING (string);
7968   if (NILP (coding_system))
7969     {
7970       if (! norecord)
7971         Vlast_coding_system_used = Qno_conversion;
           /* NOCOPY means the caller accepts STRING itself; without it
              the caller must get a fresh string.  */
7972       if (NILP (dst_object))
7973         return (nocopy ? string : Fcopy_sequence (string));
7974     }
7975
7976   if (NILP (coding_system))
7977     coding_system = Qno_conversion;
7978   else
7979     CHECK_CODING_SYSTEM (coding_system);
7980   if (NILP (dst_object))
7981     dst_object = Qt;
7982   else if (! EQ (dst_object, Qt))
7983     CHECK_BUFFER (dst_object);
7984
       /* The whole string is converted as a single, final block.  */
7985   setup_coding_system (coding_system, &coding);
7986   coding.mode |= CODING_MODE_LAST_BLOCK;
7987   chars = SCHARS (string);
7988   bytes = SBYTES (string);
7989   if (encodep)
7990     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7991   else
7992     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7993   if (! norecord)
7994     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7995
7996   return (BUFFERP (dst_object)
7997           ? make_number (coding.produced_char)
7998           : coding.dst_object);
7999 }
8000
8001
8002 /* Encode (if ENCODEP is nonzero) or decode STRING according to
8003    CODING_SYSTEM.  Do not set Vlast_coding_system_used.
8004
8005    This function is called only from macros DECODE_FILE and
8006    ENCODE_FILE, thus we ignore character composition.  */
8007
8008 Lisp_Object
8009 code_convert_string_norecord (string, coding_system, encodep)
8010      Lisp_Object string, coding_system;
8011      int encodep;
8012 {
       /* The destination is t, NOCOPY is 0 and NORECORD is 1.  */
8013   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8014 }
8015
8016
8017 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8018        2, 4, 0,
8019        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8020
8021 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8022 if the decoding operation is trivial.
8023
8024 Optional fourth arg BUFFER non-nil means that the decoded text is
8025 inserted in BUFFER instead of returned as a string.  In this case,
8026 the return value is BUFFER.
8027
8028 This function sets `last-coding-system-used' to the precise coding system
8029 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8030 not fully specified.)  */)
8031   (string, coding_system, nocopy, buffer)
8032      Lisp_Object string, coding_system, nocopy, buffer;
8033 {
       /* ENCODEP is 0 (decode) and NORECORD is 0 (do record the coding
          system used).  */
8034   return code_convert_string (string, coding_system, buffer,
8035                               0, ! NILP (nocopy), 0);
8036 }
8037
8038 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8039        2, 4, 0,
8040        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8041
8042 Optional third arg NOCOPY non-nil means it is OK to return STRING
8043 itself if the encoding operation is trivial.
8044
8045 Optional fourth arg BUFFER non-nil means that the encoded text is
8046 inserted in BUFFER instead of returned as a string.  In this case,
8047 the return value is BUFFER.
8048
8049 This function sets `last-coding-system-used' to the precise coding system
8050 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8051 not fully specified.)  */)
8052      (string, coding_system, nocopy, buffer)
8053      Lisp_Object string, coding_system, nocopy, buffer;
8054 {
       /* ENCODEP is 1 (encode).  NORECORD must be 0 so that
          `last-coding-system-used' is set as documented above, matching
          decode-coding-string.  */
8055   return code_convert_string (string, coding_system, buffer,
8056                               1, ! NILP (nocopy), 0);
8057 }
8058
8059 \f
8060 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8061        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8062 Return the corresponding character.  */)
8063      (code)
8064      Lisp_Object code;
8065 {
8066   Lisp_Object spec, attrs, val;
8067   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8068   int c;
8069
8070   CHECK_NATNUM (code);
8071   c = XFASTINT (code);
8072   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8073   attrs = AREF (spec, 0);
8074
8075   if (ASCII_BYTE_P (c)
8076       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8077     return code;
8078
8079   val = CODING_ATTR_CHARSET_LIST (attrs);
8080   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8081   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8082   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8083
8084   if (c <= 0x7F)
8085     charset = charset_roman;
8086   else if (c >= 0xA0 && c < 0xDF)
8087     {
8088       charset = charset_kana;
8089       c -= 0x80;
8090     }
8091   else
8092     {
8093       int s1 = c >> 8, s2 = c & 0xFF;
8094
8095       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8096           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8097         error ("Invalid code: %d", code);
8098       SJIS_TO_JIS (c);
8099       charset = charset_kanji;
8100     }
8101   c = DECODE_CHAR (charset, c);
8102   if (c < 0)
8103     error ("Invalid code: %d", code);
8104   return make_number (c);
8105 }
8106
8107
8108 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8109        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
8110 Return the corresponding code in SJIS.  */)
8111      (ch)
8112     Lisp_Object ch;
8113 {
8114   Lisp_Object spec, attrs, charset_list;
8115   int c;
8116   struct charset *charset;
8117   unsigned code;
8118
8119   CHECK_CHARACTER (ch);
8120   c = XFASTINT (ch);
8121   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8122   attrs = AREF (spec, 0);
8123
8124   if (ASCII_CHAR_P (c)
8125       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8126     return ch;
8127
8128   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8129   charset = char_charset (c, charset_list, &code);
8130   if (code == CHARSET_INVALID_CODE (charset))
8131     error ("Can't encode by shift_jis encoding: %d", c);
8132   JIS_TO_SJIS (code);
8133
8134   return make_number (code);
8135 }
8136
8137 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8138        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8139 Return the corresponding character.  */)
8140      (code)
8141      Lisp_Object code;
8142 {
8143   Lisp_Object spec, attrs, val;
8144   struct charset *charset_roman, *charset_big5, *charset;
8145   int c;
8146
8147   CHECK_NATNUM (code);
8148   c = XFASTINT (code);
8149   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8150   attrs = AREF (spec, 0);
8151
8152   if (ASCII_BYTE_P (c)
8153       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8154     return code;
8155
8156   val = CODING_ATTR_CHARSET_LIST (attrs);
8157   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8158   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8159
8160   if (c <= 0x7F)
8161     charset = charset_roman;
8162   else
8163     {
8164       int b1 = c >> 8, b2 = c & 0x7F;
8165       if (b1 < 0xA1 || b1 > 0xFE
8166           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8167         error ("Invalid code: %d", code);
8168       charset = charset_big5;
8169     }
8170   c = DECODE_CHAR (charset, (unsigned )c);
8171   if (c < 0)
8172     error ("Invalid code: %d", code);
8173   return make_number (c);
8174 }
8175
8176 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8177        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
8178 Return the corresponding character code in Big5.  */)
8179      (ch)
8180      Lisp_Object ch;
8181 {
8182   Lisp_Object spec, attrs, charset_list;
8183   struct charset *charset;
8184   int c;
8185   unsigned code;
8186
8187   CHECK_CHARACTER (ch);
8188   c = XFASTINT (ch);
8189   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8190   attrs = AREF (spec, 0);
8191   if (ASCII_CHAR_P (c)
8192       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8193     return ch;
8194
8195   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8196   charset = char_charset (c, charset_list, &code);
8197   if (code == CHARSET_INVALID_CODE (charset))
8198     error ("Can't encode by Big5 encoding: %d", c);
8199
8200   return make_number (code);
8201 }
8202
8203 \f
8204 DEFUN ("set-terminal-coding-system-internal",
8205        Fset_terminal_coding_system_internal,
8206        Sset_terminal_coding_system_internal, 1, 1, 0,
8207        doc: /* Internal use only.  */)
8208      (coding_system)
8209      Lisp_Object coding_system;
8210 {
8211   CHECK_SYMBOL (coding_system);
8212   setup_coding_system (Fcheck_coding_system (coding_system),
8213                         &terminal_coding);
8214
8215   /* We had better not send unsafe characters to terminal.  */
8216   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
8217   /* Characer composition should be disabled.  */
8218   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8219   terminal_coding.src_multibyte = 1;
8220   terminal_coding.dst_multibyte = 0;
8221   return Qnil;
8222 }
8223
8224 DEFUN ("set-safe-terminal-coding-system-internal",
8225        Fset_safe_terminal_coding_system_internal,
8226        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8227        doc: /* Internal use only.  */)
8228      (coding_system)
8229      Lisp_Object coding_system;
8230 {
8231   CHECK_SYMBOL (coding_system);
8232   setup_coding_system (Fcheck_coding_system (coding_system),
8233                        &safe_terminal_coding);
8234   /* Characer composition should be disabled.  */
8235   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8236   safe_terminal_coding.src_multibyte = 1;
8237   safe_terminal_coding.dst_multibyte = 0;
8238   return Qnil;
8239 }
8240
8241 DEFUN ("terminal-coding-system",
8242        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
8243        doc: /* Return coding system specified for terminal output.  */)
8244      ()
8245 {
8246   Lisp_Object coding_system;
8247
8248   coding_system = CODING_ID_NAME (terminal_coding.id);
8249   /* For backward compatibility, return nil if it is `undecided'. */
8250   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
8251 }
8252
8253 DEFUN ("set-keyboard-coding-system-internal",
8254        Fset_keyboard_coding_system_internal,
8255        Sset_keyboard_coding_system_internal, 1, 1, 0,
8256        doc: /* Internal use only.  */)
8257      (coding_system)
8258      Lisp_Object coding_system;
8259 {
8260   CHECK_SYMBOL (coding_system);
8261   setup_coding_system (Fcheck_coding_system (coding_system),
8262                        &keyboard_coding);
8263   /* Characer composition should be disabled.  */
8264   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8265   return Qnil;
8266 }
8267
8268 DEFUN ("keyboard-coding-system",
8269        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8270        doc: /* Return coding system specified for decoding keyboard input.  */)
8271      ()
8272 {
8273   return CODING_ID_NAME (keyboard_coding.id);
8274 }
8275
8276 \f
8277 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8278        Sfind_operation_coding_system,  1, MANY, 0,
8279        doc: /* Choose a coding system for an operation based on the target name.
8280 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8281 DECODING-SYSTEM is the coding system to use for decoding
8282 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8283 for encoding (in case OPERATION does encoding).
8284
8285 The first argument OPERATION specifies an I/O primitive:
8286   For file I/O, `insert-file-contents' or `write-region'.
8287   For process I/O, `call-process', `call-process-region', or `start-process'.
8288   For network I/O, `open-network-stream'.
8289
8290 The remaining arguments should be the same arguments that were passed
8291 to the primitive.  Depending on which primitive, one of those arguments
8292 is selected as the TARGET.  For example, if OPERATION does file I/O,
8293 whichever argument specifies the file name is TARGET.
8294
8295 TARGET has a meaning which depends on OPERATION:
8296   For file I/O, TARGET is a file name.
8297   For process I/O, TARGET is a process name.
8298   For network I/O, TARGET is a service name or a port number
8299
8300 This function looks up what specified for TARGET in,
8301 `file-coding-system-alist', `process-coding-system-alist',
8302 or `network-coding-system-alist' depending on OPERATION.
8303 They may specify a coding system, a cons of coding systems,
8304 or a function symbol to call.
8305 In the last case, we call the function with one argument,
8306 which is a list of all the arguments given to this function.
8307
8308 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
8309      (nargs, args)
8310      int nargs;
8311      Lisp_Object *args;
8312 {
8313   Lisp_Object operation, target_idx, target, val;
8314   register Lisp_Object chain;
8315
8316   if (nargs < 2)
8317     error ("Too few arguments");
8318   operation = args[0];
8319   if (!SYMBOLP (operation)
8320       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8321     error ("Invalid first arguement");
8322   if (nargs < 1 + XINT (target_idx))
8323     error ("Too few arguments for operation: %s",
8324            SDATA (SYMBOL_NAME (operation)));
8325   target = args[XINT (target_idx) + 1];
8326   if (!(STRINGP (target)
8327         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8328     error ("Invalid %dth argument", XINT (target_idx) + 1);
8329
8330   chain = ((EQ (operation, Qinsert_file_contents)
8331             || EQ (operation, Qwrite_region))
8332            ? Vfile_coding_system_alist
8333            : (EQ (operation, Qopen_network_stream)
8334               ? Vnetwork_coding_system_alist
8335               : Vprocess_coding_system_alist));
8336   if (NILP (chain))
8337     return Qnil;
8338
8339   for (; CONSP (chain); chain = XCDR (chain))
8340     {
8341       Lisp_Object elt;
8342
8343       elt = XCAR (chain);
8344       if (CONSP (elt)
8345           && ((STRINGP (target)
8346                && STRINGP (XCAR (elt))
8347                && fast_string_match (XCAR (elt), target) >= 0)
8348               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8349         {
8350           val = XCDR (elt);
8351           /* Here, if VAL is both a valid coding system and a valid
8352              function symbol, we return VAL as a coding system.  */
8353           if (CONSP (val))
8354             return val;
8355           if (! SYMBOLP (val))
8356             return Qnil;
8357           if (! NILP (Fcoding_system_p (val)))
8358             return Fcons (val, val);
8359           if (! NILP (Ffboundp (val)))
8360             {
8361               val = call1 (val, Flist (nargs, args));
8362               if (CONSP (val))
8363                 return val;
8364               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8365                 return Fcons (val, val);
8366             }
8367           return Qnil;
8368         }
8369     }
8370   return Qnil;
8371 }
8372
8373 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8374        Sset_coding_system_priority, 0, MANY, 0,
8375        doc: /* Assign higher priority to the coding systems given as arguments.
8376 If multiple coding systems belongs to the same category,
8377 all but the first one are ignored.
8378
8379 usage: (set-coding-system-priority ...)  */)
8380      (nargs, args)
8381      int nargs;
8382      Lisp_Object *args;
8383 {
8384   int i, j;
8385   int changed[coding_category_max];
8386   enum coding_category priorities[coding_category_max];
8387
8388   bzero (changed, sizeof changed);
8389
8390   for (i = j = 0; i < nargs; i++)
8391     {
8392       enum coding_category category;
8393       Lisp_Object spec, attrs;
8394
8395       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8396       attrs = AREF (spec, 0);
8397       category = XINT (CODING_ATTR_CATEGORY (attrs));
8398       if (changed[category])
8399         /* Ignore this coding system because a coding system of the
8400            same category already had a higher priority.  */
8401         continue;
8402       changed[category] = 1;
8403       priorities[j++] = category;
8404       if (coding_categories[category].id >= 0
8405           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8406         setup_coding_system (args[i], &coding_categories[category]);
8407       Fset (AREF (Vcoding_category_table, category), args[i]);
8408     }
8409
8410   /* Now we have decided top J priorities.  Reflect the order of the
8411      original priorities to the remaining priorities.  */
8412
8413   for (i = j, j = 0; i < coding_category_max; i++, j++)
8414     {
8415       while (j < coding_category_max
8416              && changed[coding_priorities[j]])
8417         j++;
8418       if (j == coding_category_max)
8419         abort ();
8420       priorities[i] = coding_priorities[j];
8421     }
8422
8423   bcopy (priorities, coding_priorities, sizeof priorities);
8424
8425   /* Update `coding-category-list'.  */
8426   Vcoding_category_list = Qnil;
8427   for (i = coding_category_max - 1; i >= 0; i--)
8428     Vcoding_category_list
8429       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8430                Vcoding_category_list);
8431
8432   return Qnil;
8433 }
8434
8435 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8436        Scoding_system_priority_list, 0, 1, 0,
8437        doc: /* Return a list of coding systems ordered by their priorities.
8438 HIGHESTP non-nil means just return the highest priority one.  */)
8439      (highestp)
8440      Lisp_Object highestp;
8441 {
8442   int i;
8443   Lisp_Object val;
8444
8445   for (i = 0, val = Qnil; i < coding_category_max; i++)
8446     {
8447       enum coding_category category = coding_priorities[i];
8448       int id = coding_categories[category].id;
8449       Lisp_Object attrs;
8450
8451       if (id < 0)
8452         continue;
8453       attrs = CODING_ID_ATTRS (id);
8454       if (! NILP (highestp))
8455         return CODING_ATTR_BASE_NAME (attrs);
8456       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8457     }
8458   return Fnreverse (val);
8459 }
8460
8461 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8462
8463 static Lisp_Object
8464 make_subsidiaries (base)
8465      Lisp_Object base;
8466 {
8467   Lisp_Object subsidiaries;
8468   int base_name_len = SBYTES (SYMBOL_NAME (base));
8469   char *buf = (char *) alloca (base_name_len + 6);
8470   int i;
8471
8472   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8473   subsidiaries = Fmake_vector (make_number (3), Qnil);
8474   for (i = 0; i < 3; i++)
8475     {
8476       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8477       ASET (subsidiaries, i, intern (buf));
8478     }
8479   return subsidiaries;
8480 }
8481
8482
8483 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8484        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8485        doc: /* For internal use only.
8486 usage: (define-coding-system-internal ...)  */)
8487      (nargs, args)
8488      int nargs;
8489      Lisp_Object *args;
8490 {
8491   Lisp_Object name;
8492   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
8493   Lisp_Object attrs;            /* Vector of attributes.  */
8494   Lisp_Object eol_type;
8495   Lisp_Object aliases;
8496   Lisp_Object coding_type, charset_list, safe_charsets;
8497   enum coding_category category;
8498   Lisp_Object tail, val;
8499   int max_charset_id = 0;
8500   int i;
8501
8502   if (nargs < coding_arg_max)
8503     goto short_args;
8504
8505   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8506
8507   name = args[coding_arg_name];
8508   CHECK_SYMBOL (name);
8509   CODING_ATTR_BASE_NAME (attrs) = name;
8510
8511   val = args[coding_arg_mnemonic];
8512   if (! STRINGP (val))
8513     CHECK_CHARACTER (val);
8514   CODING_ATTR_MNEMONIC (attrs) = val;
8515
8516   coding_type = args[coding_arg_coding_type];
8517   CHECK_SYMBOL (coding_type);
8518   CODING_ATTR_TYPE (attrs) = coding_type;
8519
8520   charset_list = args[coding_arg_charset_list];
8521   if (SYMBOLP (charset_list))
8522     {
8523       if (EQ (charset_list, Qiso_2022))
8524         {
8525           if (! EQ (coding_type, Qiso_2022))
8526             error ("Invalid charset-list");
8527           charset_list = Viso_2022_charset_list;
8528         }
8529       else if (EQ (charset_list, Qemacs_mule))
8530         {
8531           if (! EQ (coding_type, Qemacs_mule))
8532             error ("Invalid charset-list");
8533           charset_list = Vemacs_mule_charset_list;
8534         }
8535       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8536         if (max_charset_id < XFASTINT (XCAR (tail)))
8537           max_charset_id = XFASTINT (XCAR (tail));
8538     }
8539   else
8540     {
8541       charset_list = Fcopy_sequence (charset_list);
8542       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8543         {
8544           struct charset *charset;
8545
8546           val = Fcar (tail);
8547           CHECK_CHARSET_GET_CHARSET (val, charset);
8548           if (EQ (coding_type, Qiso_2022)
8549               ? CHARSET_ISO_FINAL (charset) < 0
8550               : EQ (coding_type, Qemacs_mule)
8551               ? CHARSET_EMACS_MULE_ID (charset) < 0
8552               : 0)
8553             error ("Can't handle charset `%s'",
8554                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8555
8556           XSETCAR (tail, make_number (charset->id));
8557           if (max_charset_id < charset->id)
8558             max_charset_id = charset->id;
8559         }
8560     }
8561   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8562
8563   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8564                                 make_number (255));
8565   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8566     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8567   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8568
8569   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8570
8571   val = args[coding_arg_decode_translation_table];
8572   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8573     CHECK_SYMBOL (val);
8574   CODING_ATTR_DECODE_TBL (attrs) = val;
8575
8576   val = args[coding_arg_encode_translation_table];
8577   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8578     CHECK_SYMBOL (val);
8579   CODING_ATTR_ENCODE_TBL (attrs) = val;
8580
8581   val = args[coding_arg_post_read_conversion];
8582   CHECK_SYMBOL (val);
8583   CODING_ATTR_POST_READ (attrs) = val;
8584
8585   val = args[coding_arg_pre_write_conversion];
8586   CHECK_SYMBOL (val);
8587   CODING_ATTR_PRE_WRITE (attrs) = val;
8588
8589   val = args[coding_arg_default_char];
8590   if (NILP (val))
8591     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8592   else
8593     {
8594       CHECK_CHARACTER (val);
8595       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8596     }
8597
8598   val = args[coding_arg_for_unibyte];
8599   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8600
8601   val = args[coding_arg_plist];
8602   CHECK_LIST (val);
8603   CODING_ATTR_PLIST (attrs) = val;
8604
8605   if (EQ (coding_type, Qcharset))
8606     {
8607       /* Generate a lisp vector of 256 elements.  Each element is nil,
8608          integer, or a list of charset IDs.
8609
8610          If Nth element is nil, the byte code N is invalid in this
8611          coding system.
8612
8613          If Nth element is a number NUM, N is the first byte of a
8614          charset whose ID is NUM.
8615
8616          If Nth element is a list of charset IDs, N is the first byte
8617          of one of them.  The list is sorted by dimensions of the
8618          charsets.  A charset of smaller dimension comes firtst. */
8619       val = Fmake_vector (make_number (256), Qnil);
8620
8621       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8622         {
8623           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8624           int dim = CHARSET_DIMENSION (charset);
8625           int idx = (dim - 1) * 4;
8626
8627           if (CHARSET_ASCII_COMPATIBLE_P (charset))
8628             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8629
8630           for (i = charset->code_space[idx];
8631                i <= charset->code_space[idx + 1]; i++)
8632             {
8633               Lisp_Object tmp, tmp2;
8634               int dim2;
8635
8636               tmp = AREF (val, i);
8637               if (NILP (tmp))
8638                 tmp = XCAR (tail);
8639               else if (NUMBERP (tmp))
8640                 {
8641                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8642                   if (dim < dim2)
8643                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8644                   else
8645                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8646                 }
8647               else
8648                 {
8649                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8650                     {
8651                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8652                       if (dim < dim2)
8653                         break;
8654                     }
8655                   if (NILP (tmp2))
8656                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8657                   else
8658                     {
8659                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8660                       XSETCAR (tmp2, XCAR (tail));
8661                     }
8662                 }
8663               ASET (val, i, tmp);
8664             }
8665         }
8666       ASET (attrs, coding_attr_charset_valids, val);
8667       category = coding_category_charset;
8668     }
8669   else if (EQ (coding_type, Qccl))
8670     {
8671       Lisp_Object valids;
8672
8673       if (nargs < coding_arg_ccl_max)
8674         goto short_args;
8675
8676       val = args[coding_arg_ccl_decoder];
8677       CHECK_CCL_PROGRAM (val);
8678       if (VECTORP (val))
8679         val = Fcopy_sequence (val);
8680       ASET (attrs, coding_attr_ccl_decoder, val);
8681
8682       val = args[coding_arg_ccl_encoder];
8683       CHECK_CCL_PROGRAM (val);
8684       if (VECTORP (val))
8685         val = Fcopy_sequence (val);
8686       ASET (attrs, coding_attr_ccl_encoder, val);
8687
8688       val = args[coding_arg_ccl_valids];
8689       valids = Fmake_string (make_number (256), make_number (0));
8690       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8691         {
8692           int from, to;
8693
8694           val = Fcar (tail);
8695           if (INTEGERP (val))
8696             {
8697               from = to = XINT (val);
8698               if (from < 0 || from > 255)
8699                 args_out_of_range_3 (val, make_number (0), make_number (255));
8700             }
8701           else
8702             {
8703               CHECK_CONS (val);
8704               CHECK_NATNUM_CAR (val);
8705               CHECK_NATNUM_CDR (val);
8706               from = XINT (XCAR (val));
8707               if (from > 255)
8708                 args_out_of_range_3 (XCAR (val),
8709                                      make_number (0), make_number (255));
8710               to = XINT (XCDR (val));
8711               if (to < from || to > 255)
8712                 args_out_of_range_3 (XCDR (val),
8713                                      XCAR (val), make_number (255));
8714             }
8715           for (i = from; i <= to; i++)
8716             SSET (valids, i, 1);
8717         }
8718       ASET (attrs, coding_attr_ccl_valids, valids);
8719
8720       category = coding_category_ccl;
8721     }
8722   else if (EQ (coding_type, Qutf_16))
8723     {
8724       Lisp_Object bom, endian;
8725
8726       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8727
8728       if (nargs < coding_arg_utf16_max)
8729         goto short_args;
8730
8731       bom = args[coding_arg_utf16_bom];
8732       if (! NILP (bom) && ! EQ (bom, Qt))
8733         {
8734           CHECK_CONS (bom);
8735           val = XCAR (bom);
8736           CHECK_CODING_SYSTEM (val);
8737           val = XCDR (bom);
8738           CHECK_CODING_SYSTEM (val);
8739         }
8740       ASET (attrs, coding_attr_utf_16_bom, bom);
8741
8742       endian = args[coding_arg_utf16_endian];
8743       CHECK_SYMBOL (endian);
8744       if (NILP (endian))
8745         endian = Qbig;
8746       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8747         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8748       ASET (attrs, coding_attr_utf_16_endian, endian);
8749
8750       category = (CONSP (bom)
8751                   ? coding_category_utf_16_auto
8752                   : NILP (bom)
8753                   ? (EQ (endian, Qbig)
8754                      ? coding_category_utf_16_be_nosig
8755                      : coding_category_utf_16_le_nosig)
8756                   : (EQ (endian, Qbig)
8757                      ? coding_category_utf_16_be
8758                      : coding_category_utf_16_le));
8759     }
8760   else if (EQ (coding_type, Qiso_2022))
8761     {
8762       Lisp_Object initial, reg_usage, request, flags;
8763       int i;
8764
8765       if (nargs < coding_arg_iso2022_max)
8766         goto short_args;
8767
8768       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8769       CHECK_VECTOR (initial);
8770       for (i = 0; i < 4; i++)
8771         {
8772           val = Faref (initial, make_number (i));
8773           if (! NILP (val))
8774             {
8775               struct charset *charset;
8776
8777               CHECK_CHARSET_GET_CHARSET (val, charset);
8778               ASET (initial, i, make_number (CHARSET_ID (charset)));
8779               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8780                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8781             }
8782           else
8783             ASET (initial, i, make_number (-1));
8784         }
8785
8786       reg_usage = args[coding_arg_iso2022_reg_usage];
8787       CHECK_CONS (reg_usage);
8788       CHECK_NUMBER_CAR (reg_usage);
8789       CHECK_NUMBER_CDR (reg_usage);
8790
8791       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8792       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8793         {
8794           int id;
8795           Lisp_Object tmp;
8796
8797           val = Fcar (tail);
8798           CHECK_CONS (val);
8799           tmp = XCAR (val);
8800           CHECK_CHARSET_GET_ID (tmp, id);
8801           CHECK_NATNUM_CDR (val);
8802           if (XINT (XCDR (val)) >= 4)
8803             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8804           XSETCAR (val, make_number (id));
8805         }
8806
8807       flags = args[coding_arg_iso2022_flags];
8808       CHECK_NATNUM (flags);
8809       i = XINT (flags);
8810       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8811         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8812
8813       ASET (attrs, coding_attr_iso_initial, initial);
8814       ASET (attrs, coding_attr_iso_usage, reg_usage);
8815       ASET (attrs, coding_attr_iso_request, request);
8816       ASET (attrs, coding_attr_iso_flags, flags);
8817       setup_iso_safe_charsets (attrs);
8818
8819       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8820         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8821                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8822                     ? coding_category_iso_7_else
8823                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8824                     ? coding_category_iso_7
8825                     : coding_category_iso_7_tight);
8826       else
8827         {
8828           int id = XINT (AREF (initial, 1));
8829
8830           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8831                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8832                        || id < 0)
8833                       ? coding_category_iso_8_else
8834                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8835                       ? coding_category_iso_8_1
8836                       : coding_category_iso_8_2);
8837         }
8838       if (category != coding_category_iso_8_1
8839           && category != coding_category_iso_8_2)
8840         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8841     }
8842   else if (EQ (coding_type, Qemacs_mule))
8843     {
8844       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8845         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8846       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8847       category = coding_category_emacs_mule;
8848     }
8849   else if (EQ (coding_type, Qshift_jis))
8850     {
8851
8852       struct charset *charset;
8853
8854       if (XINT (Flength (charset_list)) != 3
8855           && XINT (Flength (charset_list)) != 4)
8856         error ("There should be three or four charsets");
8857
8858       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8859       if (CHARSET_DIMENSION (charset) != 1)
8860         error ("Dimension of charset %s is not one",
8861                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8862       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8863         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8864
8865       charset_list = XCDR (charset_list);
8866       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8867       if (CHARSET_DIMENSION (charset) != 1)
8868         error ("Dimension of charset %s is not one",
8869                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8870
8871       charset_list = XCDR (charset_list);
8872       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8873       if (CHARSET_DIMENSION (charset) != 2)
8874         error ("Dimension of charset %s is not two",
8875                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8876
8877       charset_list = XCDR (charset_list);
8878       if (! NILP (charset_list))
8879         {
8880           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8881           if (CHARSET_DIMENSION (charset) != 2)
8882             error ("Dimension of charset %s is not two",
8883                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8884         }
8885
8886       category = coding_category_sjis;
8887       Vsjis_coding_system = name;
8888     }
8889   else if (EQ (coding_type, Qbig5))
8890     {
8891       struct charset *charset;
8892
8893       if (XINT (Flength (charset_list)) != 2)
8894         error ("There should be just two charsets");
8895
8896       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8897       if (CHARSET_DIMENSION (charset) != 1)
8898         error ("Dimension of charset %s is not one",
8899                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8900       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8901         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8902
8903       charset_list = XCDR (charset_list);
8904       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8905       if (CHARSET_DIMENSION (charset) != 2)
8906         error ("Dimension of charset %s is not two",
8907                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8908
8909       category = coding_category_big5;
8910       Vbig5_coding_system = name;
8911     }
8912   else if (EQ (coding_type, Qraw_text))
8913     {
8914       category = coding_category_raw_text;
8915       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8916     }
8917   else if (EQ (coding_type, Qutf_8))
8918     {
8919       category = coding_category_utf_8;
8920       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8921     }
8922   else if (EQ (coding_type, Qundecided))
8923     category = coding_category_undecided;
8924   else
8925     error ("Invalid coding system type: %s",
8926            SDATA (SYMBOL_NAME (coding_type)));
8927
8928   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8929   CODING_ATTR_PLIST (attrs)
8930     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8931                                 CODING_ATTR_PLIST (attrs)));
8932   CODING_ATTR_PLIST (attrs)
8933     = Fcons (QCascii_compatible_p,
8934              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
8935                     CODING_ATTR_PLIST (attrs)));
8936
8937   eol_type = args[coding_arg_eol_type];
8938   if (! NILP (eol_type)
8939       && ! EQ (eol_type, Qunix)
8940       && ! EQ (eol_type, Qdos)
8941       && ! EQ (eol_type, Qmac))
8942     error ("Invalid eol-type");
8943
8944   aliases = Fcons (name, Qnil);
8945
8946   if (NILP (eol_type))
8947     {
8948       eol_type = make_subsidiaries (name);
8949       for (i = 0; i < 3; i++)
8950         {
8951           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8952
8953           this_name = AREF (eol_type, i);
8954           this_aliases = Fcons (this_name, Qnil);
8955           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8956           this_spec = Fmake_vector (make_number (3), attrs);
8957           ASET (this_spec, 1, this_aliases);
8958           ASET (this_spec, 2, this_eol_type);
8959           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8960           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8961           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
8962           if (NILP (val))
8963             Vcoding_system_alist
8964               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8965                        Vcoding_system_alist);
8966         }
8967     }
8968
8969   spec_vec = Fmake_vector (make_number (3), attrs);
8970   ASET (spec_vec, 1, aliases);
8971   ASET (spec_vec, 2, eol_type);
8972
8973   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8974   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8975   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
8976   if (NILP (val))
8977     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8978                                   Vcoding_system_alist);
8979
8980   {
8981     int id = coding_categories[category].id;
8982
8983     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8984       setup_coding_system (name, &coding_categories[category]);
8985   }
8986
8987   return Qnil;
8988
8989  short_args:
8990   return Fsignal (Qwrong_number_of_arguments,
8991                   Fcons (intern ("define-coding-system-internal"),
8992                          make_number (nargs)));
8993 }
8994
8995
8996 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8997        3, 3, 0,
8998        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
8999   (coding_system, prop, val)
9000      Lisp_Object coding_system, prop, val;
9001 {
9002   Lisp_Object spec, attrs;
9003
9004   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9005   attrs = AREF (spec, 0);
9006   if (EQ (prop, QCmnemonic))
9007     {
9008       if (! STRINGP (val))
9009         CHECK_CHARACTER (val);
9010       CODING_ATTR_MNEMONIC (attrs) = val;
9011     }
9012   else if (EQ (prop, QCdefalut_char))
9013     {
9014       if (NILP (val))
9015         val = make_number (' ');
9016       else
9017         CHECK_CHARACTER (val);
9018       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9019     }
9020   else if (EQ (prop, QCdecode_translation_table))
9021     {
9022       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9023         CHECK_SYMBOL (val);
9024       CODING_ATTR_DECODE_TBL (attrs) = val;
9025     }
9026   else if (EQ (prop, QCencode_translation_table))
9027     {
9028       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9029         CHECK_SYMBOL (val);
9030       CODING_ATTR_ENCODE_TBL (attrs) = val;
9031     }
9032   else if (EQ (prop, QCpost_read_conversion))
9033     {
9034       CHECK_SYMBOL (val);
9035       CODING_ATTR_POST_READ (attrs) = val;
9036     }
9037   else if (EQ (prop, QCpre_write_conversion))
9038     {
9039       CHECK_SYMBOL (val);
9040       CODING_ATTR_PRE_WRITE (attrs) = val;
9041     }
9042   else if (EQ (prop, QCascii_compatible_p))
9043     {
9044       CODING_ATTR_ASCII_COMPAT (attrs) = val;
9045     }
9046
9047   CODING_ATTR_PLIST (attrs)
9048     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9049   return val;
9050 }
9051
9052
9053 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9054        Sdefine_coding_system_alias, 2, 2, 0,
9055        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
9056      (alias, coding_system)
9057      Lisp_Object alias, coding_system;
9058 {
9059   Lisp_Object spec, aliases, eol_type, val;
9060
9061   CHECK_SYMBOL (alias);
9062   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9063   aliases = AREF (spec, 1);
9064   /* ALISES should be a list of length more than zero, and the first
9065      element is a base coding system.  Append ALIAS at the tail of the
9066      list.  */
9067   while (!NILP (XCDR (aliases)))
9068     aliases = XCDR (aliases);
9069   XSETCDR (aliases, Fcons (alias, Qnil));
9070
9071   eol_type = AREF (spec, 2);
9072   if (VECTORP (eol_type))
9073     {
9074       Lisp_Object subsidiaries;
9075       int i;
9076
9077       subsidiaries = make_subsidiaries (alias);
9078       for (i = 0; i < 3; i++)
9079         Fdefine_coding_system_alias (AREF (subsidiaries, i),
9080                                      AREF (eol_type, i));
9081     }
9082
9083   Fputhash (alias, spec, Vcoding_system_hash_table);
9084   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9085   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9086   if (NILP (val))
9087     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9088                                   Vcoding_system_alist);
9089
9090   return Qnil;
9091 }
9092
9093 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9094        1, 1, 0,
9095        doc: /* Return the base of CODING-SYSTEM.
9096 Any alias or subsidiary coding system is not a base coding system.  */)
9097   (coding_system)
9098      Lisp_Object coding_system;
9099 {
9100   Lisp_Object spec, attrs;
9101
9102   if (NILP (coding_system))
9103     return (Qno_conversion);
9104   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9105   attrs = AREF (spec, 0);
9106   return CODING_ATTR_BASE_NAME (attrs);
9107 }
9108
9109 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9110        1, 1, 0,
9111        doc: "Return the property list of CODING-SYSTEM.")
9112      (coding_system)
9113      Lisp_Object coding_system;
9114 {
9115   Lisp_Object spec, attrs;
9116
9117   if (NILP (coding_system))
9118     coding_system = Qno_conversion;
9119   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9120   attrs = AREF (spec, 0);
9121   return CODING_ATTR_PLIST (attrs);
9122 }
9123
9124
9125 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9126        1, 1, 0,
9127        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
9128      (coding_system)
9129      Lisp_Object coding_system;
9130 {
9131   Lisp_Object spec;
9132
9133   if (NILP (coding_system))
9134     coding_system = Qno_conversion;
9135   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9136   return AREF (spec, 1);
9137 }
9138
9139 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9140        Scoding_system_eol_type, 1, 1, 0,
9141        doc: /* Return eol-type of CODING-SYSTEM.
9142 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9143
9144 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9145 and CR respectively.
9146
9147 A vector value indicates that a format of end-of-line should be
9148 detected automatically.  Nth element of the vector is the subsidiary
9149 coding system whose eol-type is N.  */)
9150      (coding_system)
9151      Lisp_Object coding_system;
9152 {
9153   Lisp_Object spec, eol_type;
9154   int n;
9155
9156   if (NILP (coding_system))
9157     coding_system = Qno_conversion;
9158   if (! CODING_SYSTEM_P (coding_system))
9159     return Qnil;
9160   spec = CODING_SYSTEM_SPEC (coding_system);
9161   eol_type = AREF (spec, 2);
9162   if (VECTORP (eol_type))
9163     return Fcopy_sequence (eol_type);
9164   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9165   return make_number (n);
9166 }
9167
9168 #endif /* emacs */
9169
9170 \f
9171 /*** 12. Postamble ***/
9172
9173 void
9174 init_coding_once ()
9175 {
9176   int i;
9177
9178   for (i = 0; i < coding_category_max; i++)
9179     {
9180       coding_categories[i].id = -1;
9181       coding_priorities[i] = i;
9182     }
9183
9184   /* ISO2022 specific initialize routine.  */
9185   for (i = 0; i < 0x20; i++)
9186     iso_code_class[i] = ISO_control_0;
9187   for (i = 0x21; i < 0x7F; i++)
9188     iso_code_class[i] = ISO_graphic_plane_0;
9189   for (i = 0x80; i < 0xA0; i++)
9190     iso_code_class[i] = ISO_control_1;
9191   for (i = 0xA1; i < 0xFF; i++)
9192     iso_code_class[i] = ISO_graphic_plane_1;
9193   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9194   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9195   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9196   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9197   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9198   iso_code_class[ISO_CODE_ESC] = ISO_escape;
9199   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9200   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9201   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9202
9203   for (i = 0; i < 256; i++)
9204     {
9205       emacs_mule_bytes[i] = 1;
9206     }
9207   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9208   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9209   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9210   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9211 }
9212
9213 #ifdef emacs
9214
9215 void
9216 syms_of_coding ()
9217 {
9218   staticpro (&Vcoding_system_hash_table);
9219   {
9220     Lisp_Object args[2];
9221     args[0] = QCtest;
9222     args[1] = Qeq;
9223     Vcoding_system_hash_table = Fmake_hash_table (2, args);
9224   }
9225
9226   staticpro (&Vsjis_coding_system);
9227   Vsjis_coding_system = Qnil;
9228
9229   staticpro (&Vbig5_coding_system);
9230   Vbig5_coding_system = Qnil;
9231
9232   staticpro (&Vcode_conversion_reused_workbuf);
9233   Vcode_conversion_reused_workbuf = Qnil;
9234
9235   staticpro (&Vcode_conversion_workbuf_name);
9236   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9237
9238   reused_workbuf_in_use = 0;
9239
9240   DEFSYM (Qcharset, "charset");
9241   DEFSYM (Qtarget_idx, "target-idx");
9242   DEFSYM (Qcoding_system_history, "coding-system-history");
9243   Fset (Qcoding_system_history, Qnil);
9244
9245   /* Target FILENAME is the first argument.  */
9246   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9247   /* Target FILENAME is the third argument.  */
9248   Fput (Qwrite_region, Qtarget_idx, make_number (2));
9249
9250   DEFSYM (Qcall_process, "call-process");
9251   /* Target PROGRAM is the first argument.  */
9252   Fput (Qcall_process, Qtarget_idx, make_number (0));
9253
9254   DEFSYM (Qcall_process_region, "call-process-region");
9255   /* Target PROGRAM is the third argument.  */
9256   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9257
9258   DEFSYM (Qstart_process, "start-process");
9259   /* Target PROGRAM is the third argument.  */
9260   Fput (Qstart_process, Qtarget_idx, make_number (2));
9261
9262   DEFSYM (Qopen_network_stream, "open-network-stream");
9263   /* Target SERVICE is the fourth argument.  */
9264   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9265
9266   DEFSYM (Qcoding_system, "coding-system");
9267   DEFSYM (Qcoding_aliases, "coding-aliases");
9268
9269   DEFSYM (Qeol_type, "eol-type");
9270   DEFSYM (Qunix, "unix");
9271   DEFSYM (Qdos, "dos");
9272
9273   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9274   DEFSYM (Qpost_read_conversion, "post-read-conversion");
9275   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9276   DEFSYM (Qdefault_char, "default-char");
9277   DEFSYM (Qundecided, "undecided");
9278   DEFSYM (Qno_conversion, "no-conversion");
9279   DEFSYM (Qraw_text, "raw-text");
9280
9281   DEFSYM (Qiso_2022, "iso-2022");
9282
9283   DEFSYM (Qutf_8, "utf-8");
9284   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9285
9286   DEFSYM (Qutf_16, "utf-16");
9287   DEFSYM (Qbig, "big");
9288   DEFSYM (Qlittle, "little");
9289
9290   DEFSYM (Qshift_jis, "shift-jis");
9291   DEFSYM (Qbig5, "big5");
9292
9293   DEFSYM (Qcoding_system_p, "coding-system-p");
9294
9295   DEFSYM (Qcoding_system_error, "coding-system-error");
9296   Fput (Qcoding_system_error, Qerror_conditions,
9297         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9298   Fput (Qcoding_system_error, Qerror_message,
9299         build_string ("Invalid coding system"));
9300
9301   /* Intern this now in case it isn't already done.
9302      Setting this variable twice is harmless.
9303      But don't staticpro it here--that is done in alloc.c.  */
9304   Qchar_table_extra_slots = intern ("char-table-extra-slots");
9305
9306   DEFSYM (Qtranslation_table, "translation-table");
9307   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9308   DEFSYM (Qtranslation_table_id, "translation-table-id");
9309   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9310   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9311
9312   DEFSYM (Qvalid_codes, "valid-codes");
9313
9314   DEFSYM (Qemacs_mule, "emacs-mule");
9315
9316   DEFSYM (QCcategory, ":category");
9317   DEFSYM (QCmnemonic, ":mnemonic");
9318   DEFSYM (QCdefalut_char, ":default-char");
9319   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9320   DEFSYM (QCencode_translation_table, ":encode-translation-table");
9321   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9322   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9323   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9324
9325   Vcoding_category_table
9326     = Fmake_vector (make_number (coding_category_max), Qnil);
9327   staticpro (&Vcoding_category_table);
9328   /* Followings are target of code detection.  */
9329   ASET (Vcoding_category_table, coding_category_iso_7,
9330         intern ("coding-category-iso-7"));
9331   ASET (Vcoding_category_table, coding_category_iso_7_tight,
9332         intern ("coding-category-iso-7-tight"));
9333   ASET (Vcoding_category_table, coding_category_iso_8_1,
9334         intern ("coding-category-iso-8-1"));
9335   ASET (Vcoding_category_table, coding_category_iso_8_2,
9336         intern ("coding-category-iso-8-2"));
9337   ASET (Vcoding_category_table, coding_category_iso_7_else,
9338         intern ("coding-category-iso-7-else"));
9339   ASET (Vcoding_category_table, coding_category_iso_8_else,
9340         intern ("coding-category-iso-8-else"));
9341   ASET (Vcoding_category_table, coding_category_utf_8,
9342         intern ("coding-category-utf-8"));
9343   ASET (Vcoding_category_table, coding_category_utf_16_be,
9344         intern ("coding-category-utf-16-be"));
9345   ASET (Vcoding_category_table, coding_category_utf_16_auto,
9346         intern ("coding-category-utf-16-auto"));
9347   ASET (Vcoding_category_table, coding_category_utf_16_le,
9348         intern ("coding-category-utf-16-le"));
9349   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9350         intern ("coding-category-utf-16-be-nosig"));
9351   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9352         intern ("coding-category-utf-16-le-nosig"));
9353   ASET (Vcoding_category_table, coding_category_charset,
9354         intern ("coding-category-charset"));
9355   ASET (Vcoding_category_table, coding_category_sjis,
9356         intern ("coding-category-sjis"));
9357   ASET (Vcoding_category_table, coding_category_big5,
9358         intern ("coding-category-big5"));
9359   ASET (Vcoding_category_table, coding_category_ccl,
9360         intern ("coding-category-ccl"));
9361   ASET (Vcoding_category_table, coding_category_emacs_mule,
9362         intern ("coding-category-emacs-mule"));
9363   /* Followings are NOT target of code detection.  */
9364   ASET (Vcoding_category_table, coding_category_raw_text,
9365         intern ("coding-category-raw-text"));
9366   ASET (Vcoding_category_table, coding_category_undecided,
9367         intern ("coding-category-undecided"));
9368
9369   DEFSYM (Qinsufficient_source, "insufficient-source");
9370   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9371   DEFSYM (Qinvalid_source, "invalid-source");
9372   DEFSYM (Qinterrupted, "interrupted");
9373   DEFSYM (Qinsufficient_memory, "insufficient-memory");
9374   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
9375
9376   defsubr (&Scoding_system_p);
9377   defsubr (&Sread_coding_system);
9378   defsubr (&Sread_non_nil_coding_system);
9379   defsubr (&Scheck_coding_system);
9380   defsubr (&Sdetect_coding_region);
9381   defsubr (&Sdetect_coding_string);
9382   defsubr (&Sfind_coding_systems_region_internal);
9383   defsubr (&Sunencodable_char_position);
9384   defsubr (&Scheck_coding_systems_region);
9385   defsubr (&Sdecode_coding_region);
9386   defsubr (&Sencode_coding_region);
9387   defsubr (&Sdecode_coding_string);
9388   defsubr (&Sencode_coding_string);
9389   defsubr (&Sdecode_sjis_char);
9390   defsubr (&Sencode_sjis_char);
9391   defsubr (&Sdecode_big5_char);
9392   defsubr (&Sencode_big5_char);
9393   defsubr (&Sset_terminal_coding_system_internal);
9394   defsubr (&Sset_safe_terminal_coding_system_internal);
9395   defsubr (&Sterminal_coding_system);
9396   defsubr (&Sset_keyboard_coding_system_internal);
9397   defsubr (&Skeyboard_coding_system);
9398   defsubr (&Sfind_operation_coding_system);
9399   defsubr (&Sset_coding_system_priority);
9400   defsubr (&Sdefine_coding_system_internal);
9401   defsubr (&Sdefine_coding_system_alias);
9402   defsubr (&Scoding_system_put);
9403   defsubr (&Scoding_system_base);
9404   defsubr (&Scoding_system_plist);
9405   defsubr (&Scoding_system_aliases);
9406   defsubr (&Scoding_system_eol_type);
9407   defsubr (&Scoding_system_priority_list);
9408
9409   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9410                doc: /* List of coding systems.
9411
9412 Do not alter the value of this variable manually.  This variable should be
9413 updated by the functions `define-coding-system' and
9414 `define-coding-system-alias'.  */);
9415   Vcoding_system_list = Qnil;
9416
9417   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9418                doc: /* Alist of coding system names.
9419 Each element is one element list of coding system name.
9420 This variable is given to `completing-read' as TABLE argument.
9421
9422 Do not alter the value of this variable manually.  This variable should be
9423 updated by the functions `make-coding-system' and
9424 `define-coding-system-alias'.  */);
9425   Vcoding_system_alist = Qnil;
9426
9427   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9428                doc: /* List of coding-categories (symbols) ordered by priority.
9429
9430 On detecting a coding system, Emacs tries code detection algorithms
9431 associated with each coding-category one by one in this order.  When
9432 one algorithm agrees with a byte sequence of source text, the coding
9433 system bound to the corresponding coding-category is selected.
9434
9435 Don't modify this variable directly, but use `set-coding-priority'.  */);
9436   {
9437     int i;
9438
9439     Vcoding_category_list = Qnil;
9440     for (i = coding_category_max - 1; i >= 0; i--)
9441       Vcoding_category_list
9442         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9443                  Vcoding_category_list);
9444   }
9445
9446   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9447                doc: /* Specify the coding system for read operations.
9448 It is useful to bind this variable with `let', but do not set it globally.
9449 If the value is a coding system, it is used for decoding on read operation.
9450 If not, an appropriate element is used from one of the coding system alists:
9451 There are three such tables, `file-coding-system-alist',
9452 `process-coding-system-alist', and `network-coding-system-alist'.  */);
9453   Vcoding_system_for_read = Qnil;
9454
9455   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9456                doc: /* Specify the coding system for write operations.
9457 Programs bind this variable with `let', but you should not set it globally.
9458 If the value is a coding system, it is used for encoding of output,
9459 when writing it to a file and when sending it to a file or subprocess.
9460
9461 If this does not specify a coding system, an appropriate element
9462 is used from one of the coding system alists:
9463 There are three such tables, `file-coding-system-alist',
9464 `process-coding-system-alist', and `network-coding-system-alist'.
9465 For output to files, if the above procedure does not specify a coding system,
9466 the value of `buffer-file-coding-system' is used.  */);
9467   Vcoding_system_for_write = Qnil;
9468
9469   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9470                doc: /*
9471 Coding system used in the latest file or process I/O.  */);
9472   Vlast_coding_system_used = Qnil;
9473
9474   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9475                doc: /*
9476 Error status of the last code conversion.
9477
9478 When an error was detected in the last code conversion, this variable
9479 is set to one of the following symbols.
9480   `insufficient-source'
9481   `inconsistent-eol'
9482   `invalid-source'
9483   `interrupted'
9484   `insufficient-memory'
9485 When no error was detected, the value doesn't change.  So, to check
9486 the error status of a code conversion by this variable, you must
9487 explicitly set this variable to nil before performing code
9488 conversion.  */);
9489   Vlast_code_conversion_error = Qnil;
9490
9491   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9492                doc: /*
9493 *Non-nil means always inhibit code conversion of end-of-line format.
9494 See info node `Coding Systems' and info node `Text and Binary' concerning
9495 such conversion.  */);
9496   inhibit_eol_conversion = 0;
9497
9498   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9499                doc: /*
9500 Non-nil means process buffer inherits coding system of process output.
9501 Bind it to t if the process output is to be treated as if it were a file
9502 read from some filesystem.  */);
9503   inherit_process_coding_system = 0;
9504
9505   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9506                doc: /*
9507 Alist to decide a coding system to use for a file I/O operation.
9508 The format is ((PATTERN . VAL) ...),
9509 where PATTERN is a regular expression matching a file name,
9510 VAL is a coding system, a cons of coding systems, or a function symbol.
9511 If VAL is a coding system, it is used for both decoding and encoding
9512 the file contents.
9513 If VAL is a cons of coding systems, the car part is used for decoding,
9514 and the cdr part is used for encoding.
9515 If VAL is a function symbol, the function must return a coding system
9516 or a cons of coding systems which are used as above.  The function gets
9517 the arguments with which `find-operation-coding-system' was called.
9518
9519 See also the function `find-operation-coding-system'
9520 and the variable `auto-coding-alist'.  */);
9521   Vfile_coding_system_alist = Qnil;
9522
9523   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9524                doc: /*
9525 Alist to decide a coding system to use for a process I/O operation.
9526 The format is ((PATTERN . VAL) ...),
9527 where PATTERN is a regular expression matching a program name,
9528 VAL is a coding system, a cons of coding systems, or a function symbol.
9529 If VAL is a coding system, it is used for both decoding what received
9530 from the program and encoding what sent to the program.
9531 If VAL is a cons of coding systems, the car part is used for decoding,
9532 and the cdr part is used for encoding.
9533 If VAL is a function symbol, the function must return a coding system
9534 or a cons of coding systems which are used as above.
9535
9536 See also the function `find-operation-coding-system'.  */);
9537   Vprocess_coding_system_alist = Qnil;
9538
9539   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9540                doc: /*
9541 Alist to decide a coding system to use for a network I/O operation.
9542 The format is ((PATTERN . VAL) ...),
9543 where PATTERN is a regular expression matching a network service name
9544 or is a port number to connect to,
9545 VAL is a coding system, a cons of coding systems, or a function symbol.
9546 If VAL is a coding system, it is used for both decoding what received
9547 from the network stream and encoding what sent to the network stream.
9548 If VAL is a cons of coding systems, the car part is used for decoding,
9549 and the cdr part is used for encoding.
9550 If VAL is a function symbol, the function must return a coding system
9551 or a cons of coding systems which are used as above.
9552
9553 See also the function `find-operation-coding-system'.  */);
9554   Vnetwork_coding_system_alist = Qnil;
9555
9556   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9557                doc: /* Coding system to use with system messages.
9558 Also used for decoding keyboard input on X Window system.  */);
9559   Vlocale_coding_system = Qnil;
9560
9561   /* The eol mnemonics are reset in startup.el system-dependently.  */
9562   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9563                doc: /*
9564 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
9565   eol_mnemonic_unix = build_string (":");
9566
9567   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9568                doc: /*
9569 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
9570   eol_mnemonic_dos = build_string ("\\");
9571
9572   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9573                doc: /*
9574 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
9575   eol_mnemonic_mac = build_string ("/");
9576
9577   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9578                doc: /*
9579 *String displayed in mode line when end-of-line format is not yet determined.  */);
9580   eol_mnemonic_undecided = build_string (":");
9581
9582   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9583                doc: /*
9584 *Non-nil enables character translation while encoding and decoding.  */);
9585   Venable_character_translation = Qt;
9586
9587   DEFVAR_LISP ("standard-translation-table-for-decode",
9588                &Vstandard_translation_table_for_decode,
9589                doc: /* Table for translating characters while decoding.  */);
9590   Vstandard_translation_table_for_decode = Qnil;
9591
9592   DEFVAR_LISP ("standard-translation-table-for-encode",
9593                &Vstandard_translation_table_for_encode,
9594                doc: /* Table for translating characters while encoding.  */);
9595   Vstandard_translation_table_for_encode = Qnil;
9596
9597   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9598                doc: /* Alist of charsets vs revision numbers.
9599 While encoding, if a charset (car part of an element) is found,
9600 designate it with the escape sequence identifying revision (cdr part
9601 of the element).  */);
9602   Vcharset_revision_table = Qnil;
9603
9604   DEFVAR_LISP ("default-process-coding-system",
9605                &Vdefault_process_coding_system,
9606                doc: /* Cons of coding systems used for process I/O by default.
9607 The car part is used for decoding a process output,
9608 the cdr part is used for encoding a text to be sent to a process.  */);
9609   Vdefault_process_coding_system = Qnil;
9610
9611   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9612                doc: /*
9613 Table of extra Latin codes in the range 128..159 (inclusive).
9614 This is a vector of length 256.
9615 If Nth element is non-nil, the existence of code N in a file
9616 \(or output of subprocess) doesn't prevent it to be detected as
9617 a coding system of ISO 2022 variant which has a flag
9618 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9619 or reading output of a subprocess.
9620 Only the 128th through 159th elements have a meaning.  */);
9621   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9622
9623   DEFVAR_LISP ("select-safe-coding-system-function",
9624                &Vselect_safe_coding_system_function,
9625                doc: /*
9626 Function to call to select safe coding system for encoding a text.
9627
9628 If set, this function is called to force a user to select a proper
9629 coding system which can encode the text in the case that a default
9630 coding system used in each operation can't encode the text.
9631
9632 The default value is `select-safe-coding-system' (which see).  */);
9633   Vselect_safe_coding_system_function = Qnil;
9634
9635   DEFVAR_BOOL ("coding-system-require-warning",
9636                &coding_system_require_warning,
9637                doc: /* Internal use only.
9638 If non-nil, on writing a file, `select-safe-coding-system-function' is
9639 called even if `coding-system-for-write' is non-nil.  The command
9640 `universal-coding-system-argument' binds this variable to t temporarily.  */);
9641   coding_system_require_warning = 0;
9642
9643
9644   DEFVAR_BOOL ("inhibit-iso-escape-detection",
9645                &inhibit_iso_escape_detection,
9646                doc: /*
9647 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9648
9649 By default, on reading a file, Emacs tries to detect how the text is
9650 encoded.  This code detection is sensitive to escape sequences.  If
9651 the sequence is valid as ISO2022, the code is determined as one of
9652 the ISO2022 encodings, and the file is decoded by the corresponding
9653 coding system (e.g. `iso-2022-7bit').
9654
9655 However, there may be a case that you want to read escape sequences in
9656 a file as is.  In such a case, you can set this variable to non-nil.
9657 Then, as the code detection ignores any escape sequences, no file is
9658 detected as encoded in some ISO2022 encoding.  The result is that all
9659 escape sequences become visible in a buffer.
9660
9661 The default value is nil, and it is strongly recommended not to change
9662 it.  That is because many Emacs Lisp source files that contain
9663 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9664 in Emacs's distribution, and they won't be decoded correctly on
9665 reading if you suppress escape sequence detection.
9666
9667 The other way to read escape sequences in a file without decoding is
9668 to explicitly specify some coding system that doesn't use ISO2022's
escape sequence (e.g. `latin-1') on reading by \\[universal-coding-system-argument].  */);
9670   inhibit_iso_escape_detection = 0;
9671
9672   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9673                doc: /* Char table for translating self-inserting characters.
9674 This is applied to the result of input methods, not their input.  See also
9675 `keyboard-translate-table'.  */);
9676     Vtranslation_table_for_input = Qnil;
9677
9678   {
9679     Lisp_Object args[coding_arg_max];
9680     Lisp_Object plist[16];
9681     int i;
9682
9683     for (i = 0; i < coding_arg_max; i++)
9684       args[i] = Qnil;
9685
9686     plist[0] = intern (":name");
9687     plist[1] = args[coding_arg_name] = Qno_conversion;
9688     plist[2] = intern (":mnemonic");
9689     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9690     plist[4] = intern (":coding-type");
9691     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9692     plist[6] = intern (":ascii-compatible-p");
9693     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9694     plist[8] = intern (":default-char");
9695     plist[9] = args[coding_arg_default_char] = make_number (0);
9696     plist[10] = intern (":for-unibyte");
9697     plist[11] = args[coding_arg_for_unibyte] = Qt;
9698     plist[12] = intern (":docstring");
9699     plist[13] = build_string ("Do no conversion.\n\
9700 \n\
9701 When you visit a file with this coding, the file is read into a\n\
9702 unibyte buffer as is, thus each byte of a file is treated as a\n\
9703 character.");
9704     plist[14] = intern (":eol-type");
9705     plist[15] = args[coding_arg_eol_type] = Qunix;
9706     args[coding_arg_plist] = Flist (16, plist);
9707     Fdefine_coding_system_internal (coding_arg_max, args);
9708
9709     plist[1] = args[coding_arg_name] = Qundecided;
9710     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
9711     plist[5] = args[coding_arg_coding_type] = Qundecided;
9712     /* This is already set.
9713        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
9714     plist[8] = intern (":charset-list");
9715     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
9716     plist[11] = args[coding_arg_for_unibyte] = Qnil;
9717     plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
9718     plist[15] = args[coding_arg_eol_type] = Qnil;
9719     args[coding_arg_plist] = Flist (16, plist);
9720     Fdefine_coding_system_internal (coding_arg_max, args);
9721   }
9722
9723   setup_coding_system (Qno_conversion, &keyboard_coding);
9724   setup_coding_system (Qundecided, &terminal_coding);
9725   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9726
9727   {
9728     int i;
9729
9730     for (i = 0; i < coding_category_max; i++)
9731       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9732   }
9733 }
9734
9735 char *
9736 emacs_strerror (error_number)
9737      int error_number;
9738 {
9739   char *str;
9740
9741   synchronize_system_messages_locale ();
9742   str = strerror (error_number);
9743
9744   if (! NILP (Vlocale_coding_system))
9745     {
9746       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9747                                                       Vlocale_coding_system,
9748                                                       0);
9749       str = (char *) SDATA (dec);
9750     }
9751
9752   return str;
9753 }
9754
9755 #endif /* emacs */
9756
9757 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9758    (do not change this comment) */