src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software; you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation; either version 2, or (at your option)
  16 any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs; see the file COPYING.  If not, write to
  25 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  26 Boston, MA 02110-1301, USA.  */
  27
  28 /*** TABLE OF CONTENTS ***
  29
  30   0. General comments
  31   1. Preamble
  32   2. Emacs' internal format (emacs-utf-8) handlers
  33   3. UTF-8 handlers
  34   4. UTF-16 handlers
  35   5. Charset-base coding systems handlers
  36   6. emacs-mule (old Emacs' internal format) handlers
  37   7. ISO2022 handlers
  38   8. Shift-JIS and BIG5 handlers
  39   9. CCL handlers
  40   10. C library functions
  41   11. Emacs Lisp library functions
  42   12. Postamble
  43
  44 */
  45
  46 /*** 0. General comments ***
  47
  48
  49 CODING SYSTEM
  50
  51   A coding system is an object for an encoding mechanism that contains
  52   information about how to convert byte sequences to character
  53   sequences and vice versa.  When we say "decode", it means converting
  54   a byte sequence of a specific coding system into a character
  55   sequence that is represented by Emacs' internal coding system
  56   `emacs-utf-8', and when we say "encode", it means converting a
  57   character sequence of emacs-utf-8 to a byte sequence of a specific
  58   coding system.
  59
  60   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  61   C level, a coding system is represented by a vector of attributes
  62   stored in the hash table Vcharset_hash_table.  The conversion from
  63   coding system symbol to attributes vector is done by looking up
  64   Vcharset_hash_table by the symbol.
  65
  66   Coding systems are classified into the following types depending on
  67   the encoding mechanism.  Here's a brief description of the types.
  68
  69   o UTF-8
  70
  71   o UTF-16
  72
  73   o Charset-base coding system
  74
  75   A coding system defined by one or more (coded) character sets.
  76   Decoding and encoding are done by a code converter defined for each
  77   character set.
  78
  79   o Old Emacs internal format (emacs-mule)
  80
  81   The coding system adopted by old versions of Emacs (20 and 21).
  82
  83   o ISO2022-base coding system
  84
  85   The most famous coding system for multiple character sets.  X's
  86   Compound Text, various EUCs (Extended Unix Code), and coding systems
  87   used in the Internet communication such as ISO-2022-JP are all
  88   variants of ISO2022.
  89
  90   o SJIS (or Shift-JIS or MS-Kanji-Code)
  91
  92   A coding system to encode character sets: ASCII, JISX0201, and
  93   JISX0208.  Widely used for PC's in Japan.  Details are described in
  94   section 8.
  95
  96   o BIG5
  97
  98   A coding system to encode character sets: ASCII and Big5.  Widely
  99   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
 100   described in section 8.  In this file, when we write "big5" (all
 101   lowercase), we mean the coding system, and when we write "Big5"
 102   (capitalized), we mean the character set.
 103
 104   o CCL
 105
 106   If a user wants to decode/encode text encoded in a coding system
 107   not listed above, he can supply a decoder and an encoder for it in
 108   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 109   program while decoding/encoding.
 110
 111   o Raw-text
 112
 113   A coding system for text containing raw eight-bit data.  Emacs
 114   treats each byte of source text as a character (except for
 115   end-of-line conversion).
 116
 117   o No-conversion
 118
 119   Like raw text, but don't do end-of-line conversion.
 120
 121
 122 END-OF-LINE FORMAT
 123
 124   How text end-of-line is encoded depends on operating system.  For
 125   instance, Unix's format is just one byte of LF (line-feed) code,
 126   whereas DOS's format is two-byte sequence of `carriage-return' and
 127   `line-feed' codes.  MacOS's format is usually one byte of
 128   `carriage-return'.
 129
 130   Since text character encoding and end-of-line encoding are
 131   independent, any coding system described above can take any format
 132   of end-of-line (except for no-conversion).
 133
 134 STRUCT CODING_SYSTEM
 135
 136   Before using a coding system for code conversion (i.e. decoding and
 137   encoding), we setup a structure of type `struct coding_system'.
 138   This structure keeps various information about a specific code
 139   conversion (e.g. the location of source and destination data).
 140
 141 */
 142
 143 /* COMMON MACROS */
 144
 145
 146 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 147
 148   These functions check if a byte sequence specified as a source in
 149   CODING conforms to the format of XXX, and update the members of
 150   DETECT_INFO.
 151
 152   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 153
 154   Below is the template of these functions.  */
 155
 156 #if 0
 157 static int
 158 detect_coding_XXX (coding, detect_info)
 159      struct coding_system *coding;
 160      struct coding_detection_info *detect_info;
 161 {
 162   const unsigned char *src = coding->source;
 163   const unsigned char *src_end = coding->source + coding->src_bytes;
 164   int multibytep = coding->src_multibyte;
 165   int consumed_chars = 0;
 166   int found = 0;
 167   ...;
 168
 169   while (1)
 170     {
 171       /* Get one byte from the source.  If the souce is exausted, jump
 172          to no_more_source:.  */
 173       ONE_MORE_BYTE (c);
 174
 175       if (! __C_conforms_to_XXX___ (c))
 176         break;
 177       if (! __C_strongly_suggests_XXX__ (c))
 178         found = CATEGORY_MASK_XXX;
 179     }
 180   /* The byte sequence is invalid for XXX.  */
 181   detect_info->rejected |= CATEGORY_MASK_XXX;
 182   return 0;
 183
 184  no_more_source:
 185   /* The source exausted successfully.  */
 186   detect_info->found |= found;
 187   return 1;
 188 }
 189 #endif
 190
 191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 192
 193   These functions decode a byte sequence specified as a source by
 194   CODING.  The resulting multibyte text goes to a place pointed to by
 195   CODING->charbuf, the length of which should not exceed
 196   CODING->charbuf_size;
 197
 198   These functions set the information of original and decoded texts in
 199   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 200   They also set CODING->result to one of CODING_RESULT_XXX indicating
 201   how the decoding is finished.
 202
 203   Below is the template of these functions.  */
 204
 205 #if 0
 206 static void
 207 decode_coding_XXXX (coding)
 208      struct coding_system *coding;
 209 {
 210   const unsigned char *src = coding->source + coding->consumed;
 211   const unsigned char *src_end = coding->source + coding->src_bytes;
 212   /* SRC_BASE remembers the start position in source in each loop.
 213      The loop will be exited when there's not enough source code, or
 214      when there's no room in CHARBUF for a decoded character.  */
 215   const unsigned char *src_base;
 216   /* A buffer to produce decoded characters.  */
 217   int *charbuf = coding->charbuf + coding->charbuf_used;
 218   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 219   int multibytep = coding->src_multibyte;
 220
 221   while (1)
 222     {
 223       src_base = src;
 224       if (charbuf < charbuf_end)
 225         /* No more room to produce a decoded character.  */
 226         break;
 227       ONE_MORE_BYTE (c);
 228       /* Decode it. */
 229     }
 230
 231  no_more_source:
 232   if (src_base < src_end
 233       && coding->mode & CODING_MODE_LAST_BLOCK)
 234     /* If the source ends by partial bytes to construct a character,
 235        treat them as eight-bit raw data.  */
 236     while (src_base < src_end && charbuf < charbuf_end)
 237       *charbuf++ = *src_base++;
 238   /* Remember how many bytes and characters we consumed.  If the
 239      source is multibyte, the bytes and chars are not identical.  */
 240   coding->consumed = coding->consumed_char = src_base - coding->source;
 241   /* Remember how many characters we produced.  */
 242   coding->charbuf_used = charbuf - coding->charbuf;
 243 }
 244 #endif
 245
 246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 247
 248   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 249   internal multibyte format by CODING.  The resulting byte sequence
 250   goes to a place pointed to by DESTINATION, the length of which
 251   should not exceed DST_BYTES.
 252
 253   These functions set the information of original and encoded texts in
 254   the members produced, produced_char, consumed, and consumed_char of
 255   the structure *CODING.  They also set the member result to one of
 256   CODING_RESULT_XXX indicating how the encoding finished.
 257
 258   DST_BYTES zero means that source area and destination area are
 259   overlapped, which means that we can produce a encoded text until it
 260   reaches at the head of not-yet-encoded source text.
 261
 262   Below is a template of these functions.  */
 263 #if 0
 264 static void
 265 encode_coding_XXX (coding)
 266      struct coding_system *coding;
 267 {
 268   int multibytep = coding->dst_multibyte;
 269   int *charbuf = coding->charbuf;
 270   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 271   unsigned char *dst = coding->destination + coding->produced;
 272   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 273   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 274   int produced_chars = 0;
 275
 276   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 277     {
 278       int c = *charbuf;
 279       /* Encode C into DST, and increment DST.  */
 280     }
 281  label_no_more_destination:
 282   /* How many chars and bytes we produced.  */
 283   coding->produced_char += produced_chars;
 284   coding->produced = dst - coding->destination;
 285 }
 286 #endif
 287
 288 \f
 289 /*** 1. Preamble ***/
 290
 291 #include <config.h>
 292 #include <stdio.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302
 303 Lisp_Object Vcoding_system_hash_table;
 304
 305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 306 Lisp_Object Qunix, Qdos;
 307 extern Lisp_Object Qmac;        /* frame.c */
 308 Lisp_Object Qbuffer_file_coding_system;
 309 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 310 Lisp_Object Qdefault_char;
 311 Lisp_Object Qno_conversion, Qundecided;
 312 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 313 Lisp_Object Qbig, Qlittle;
 314 Lisp_Object Qcoding_system_history;
 315 Lisp_Object Qvalid_codes;
 316 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
 317 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 318 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 319 Lisp_Object QCascii_compatible_p;
 320
 321 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 322 Lisp_Object Qcall_process, Qcall_process_region;
 323 Lisp_Object Qstart_process, Qopen_network_stream;
 324 Lisp_Object Qtarget_idx;
 325
 326 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 327 Lisp_Object Qinterrupted, Qinsufficient_memory;
 328
 329 /* If a symbol has this property, evaluate the value to define the
 330    symbol as a coding system.  */
 331 static Lisp_Object Qcoding_system_define_form;
 332
 333 int coding_system_require_warning;
 334
 335 Lisp_Object Vselect_safe_coding_system_function;
 336
 337 /* Mnemonic string for each format of end-of-line.  */
 338 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 339 /* Mnemonic string to indicate format of end-of-line is not yet
 340    decided.  */
 341 Lisp_Object eol_mnemonic_undecided;
 342
 343 #ifdef emacs
 344
 345 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 346
 347 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 348
 349 /* Coding system emacs-mule and raw-text are for converting only
 350    end-of-line format.  */
 351 Lisp_Object Qemacs_mule, Qraw_text;
 352 Lisp_Object Qutf_8_emacs;
 353
 354 /* Coding-systems are handed between Emacs Lisp programs and C internal
 355    routines by the following three variables.  */
 356 /* Coding-system for reading files and receiving data from process.  */
 357 Lisp_Object Vcoding_system_for_read;
 358 /* Coding-system for writing files and sending data to process.  */
 359 Lisp_Object Vcoding_system_for_write;
 360 /* Coding-system actually used in the latest I/O.  */
 361 Lisp_Object Vlast_coding_system_used;
 362 /* Set to non-nil when an error is detected while code conversion.  */
 363 Lisp_Object Vlast_code_conversion_error;
 364 /* A vector of length 256 which contains information about special
 365    Latin codes (especially for dealing with Microsoft codes).  */
 366 Lisp_Object Vlatin_extra_code_table;
 367
 368 /* Flag to inhibit code conversion of end-of-line format.  */
 369 int inhibit_eol_conversion;
 370
 371 /* Flag to inhibit ISO2022 escape sequence detection.  */
 372 int inhibit_iso_escape_detection;
 373
 374 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 375 int inherit_process_coding_system;
 376
 377 /* Coding system to be used to encode text for terminal display.  */
 378 struct coding_system terminal_coding;
 379
 380 /* Coding system to be used to encode text for terminal display when
 381    terminal coding system is nil.  */
 382 struct coding_system safe_terminal_coding;
 383
 384 /* Coding system of what is sent from terminal keyboard.  */
 385 struct coding_system keyboard_coding;
 386
 387 Lisp_Object Vfile_coding_system_alist;
 388 Lisp_Object Vprocess_coding_system_alist;
 389 Lisp_Object Vnetwork_coding_system_alist;
 390
 391 Lisp_Object Vlocale_coding_system;
 392
 393 #endif /* emacs */
 394
 395 /* Flag to tell if we look up translation table on character code
 396    conversion.  */
 397 Lisp_Object Venable_character_translation;
 398 /* Standard translation table to look up on decoding (reading).  */
 399 Lisp_Object Vstandard_translation_table_for_decode;
 400 /* Standard translation table to look up on encoding (writing).  */
 401 Lisp_Object Vstandard_translation_table_for_encode;
 402
 403 Lisp_Object Qtranslation_table;
 404 Lisp_Object Qtranslation_table_id;
 405 Lisp_Object Qtranslation_table_for_decode;
 406 Lisp_Object Qtranslation_table_for_encode;
 407
 408 /* Alist of charsets vs revision number.  */
 409 static Lisp_Object Vcharset_revision_table;
 410
 411 /* Default coding systems used for process I/O.  */
 412 Lisp_Object Vdefault_process_coding_system;
 413
 414 /* Char table for translating Quail and self-inserting input.  */
 415 Lisp_Object Vtranslation_table_for_input;
 416
 417 /* Two special coding systems.  */
 418 Lisp_Object Vsjis_coding_system;
 419 Lisp_Object Vbig5_coding_system;
 420
 421 /* ISO2022 section */
 422
 423 #define CODING_ISO_INITIAL(coding, reg)                 \
 424   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 425                      coding_attr_iso_initial),          \
 426                reg)))
 427
 428
 429 #define CODING_ISO_REQUEST(coding, charset_id)  \
 430   ((charset_id <= (coding)->max_charset_id      \
 431     ? (coding)->safe_charsets[charset_id]       \
 432     : -1))
 433
 434
 435 #define CODING_ISO_FLAGS(coding)        \
 436   ((coding)->spec.iso_2022.flags)
 437 #define CODING_ISO_DESIGNATION(coding, reg)     \
 438   ((coding)->spec.iso_2022.current_designation[reg])
 439 #define CODING_ISO_INVOCATION(coding, plane)    \
 440   ((coding)->spec.iso_2022.current_invocation[plane])
 441 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 442   ((coding)->spec.iso_2022.single_shifting)
 443 #define CODING_ISO_BOL(coding)  \
 444   ((coding)->spec.iso_2022.bol)
 445 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 446   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 447
 448 /* Control characters of ISO2022.  */
 449                         /* code */      /* function */
 450 #define ISO_CODE_LF     0x0A            /* line-feed */
 451 #define ISO_CODE_CR     0x0D            /* carriage-return */
 452 #define ISO_CODE_SO     0x0E            /* shift-out */
 453 #define ISO_CODE_SI     0x0F            /* shift-in */
 454 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 455 #define ISO_CODE_ESC    0x1B            /* escape */
 456 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 457 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 458 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 459
 460 /* All code (1-byte) of ISO2022 is classified into one of the
 461    followings.  */
 462 enum iso_code_class_type
 463   {
 464     ISO_control_0,              /* Control codes in the range
 465                                    0x00..0x1F and 0x7F, except for the
 466                                    following 5 codes.  */
 467     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 468     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 469     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 470     ISO_escape,                 /* ISO_CODE_SO (0x1B) */
 471     ISO_control_1,              /* Control codes in the range
 472                                    0x80..0x9F, except for the
 473                                    following 3 codes.  */
 474     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 475     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 476     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 477     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 478     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 479     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 480     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 481   };
 482
 483 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 484     `iso-flags' attribute of an iso2022 coding system.  */
 485
 486 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 487    instead of the correct short-form sequence (e.g. ESC $ A).  */
 488 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 489
 490 /* If set, reset graphic planes and registers at end-of-line to the
 491    initial state.  */
 492 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 493
 494 /* If set, reset graphic planes and registers before any control
 495    characters to the initial state.  */
 496 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 497
 498 /* If set, encode by 7-bit environment.  */
 499 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 500
 501 /* If set, use locking-shift function.  */
 502 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 503
 504 /* If set, use single-shift function.  Overwrite
 505    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 506 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 507
 508 /* If set, use designation escape sequence.  */
 509 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 510
 511 /* If set, produce revision number sequence.  */
 512 #define CODING_ISO_FLAG_REVISION        0x0080
 513
 514 /* If set, produce ISO6429's direction specifying sequence.  */
 515 #define CODING_ISO_FLAG_DIRECTION       0x0100
 516
 517 /* If set, assume designation states are reset at beginning of line on
 518    output.  */
 519 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 520
 521 /* If set, designation sequence should be placed at beginning of line
 522    on output.  */
 523 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 524
 525 /* If set, do not encode unsafe charactes on output.  */
 526 #define CODING_ISO_FLAG_SAFE            0x0800
 527
 528 /* If set, extra latin codes (128..159) are accepted as a valid code
 529    on input.  */
 530 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 531
 532 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 533
 534 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 535
 536 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 537
 538 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 539
 540 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 541
 542 /* A character to be produced on output if encoding of the original
 543    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 544 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 545
 546
 547 /* UTF-16 section */
 548 #define CODING_UTF_16_BOM(coding)       \
 549   ((coding)->spec.utf_16.bom)
 550
 551 #define CODING_UTF_16_ENDIAN(coding)    \
 552   ((coding)->spec.utf_16.endian)
 553
 554 #define CODING_UTF_16_SURROGATE(coding) \
 555   ((coding)->spec.utf_16.surrogate)
 556
 557
 558 /* CCL section */
 559 #define CODING_CCL_DECODER(coding)      \
 560   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 561 #define CODING_CCL_ENCODER(coding)      \
 562   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 563 #define CODING_CCL_VALIDS(coding)                                          \
 564   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 565
 566 /* Index for each coding category in `coding_categories' */
 567
 568 enum coding_category
 569   {
 570     coding_category_iso_7,
 571     coding_category_iso_7_tight,
 572     coding_category_iso_8_1,
 573     coding_category_iso_8_2,
 574     coding_category_iso_7_else,
 575     coding_category_iso_8_else,
 576     coding_category_utf_8,
 577     coding_category_utf_16_auto,
 578     coding_category_utf_16_be,
 579     coding_category_utf_16_le,
 580     coding_category_utf_16_be_nosig,
 581     coding_category_utf_16_le_nosig,
 582     coding_category_charset,
 583     coding_category_sjis,
 584     coding_category_big5,
 585     coding_category_ccl,
 586     coding_category_emacs_mule,
 587     /* All above are targets of code detection.  */
 588     coding_category_raw_text,
 589     coding_category_undecided,
 590     coding_category_max
 591   };
 592
 593 /* Definitions of flag bits used in detect_coding_XXXX.  */
 594 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 595 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 596 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 597 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 598 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 599 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 600 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 601 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 602 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 603 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 604 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 605 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 606 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 607 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 608 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 609 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 610 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 611 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 612
 613 /* This value is returned if detect_coding_mask () find nothing other
 614    than ASCII characters.  */
 615 #define CATEGORY_MASK_ANY               \
 616   (CATEGORY_MASK_ISO_7                  \
 617    | CATEGORY_MASK_ISO_7_TIGHT          \
 618    | CATEGORY_MASK_ISO_8_1              \
 619    | CATEGORY_MASK_ISO_8_2              \
 620    | CATEGORY_MASK_ISO_7_ELSE           \
 621    | CATEGORY_MASK_ISO_8_ELSE           \
 622    | CATEGORY_MASK_UTF_8                \
 623    | CATEGORY_MASK_UTF_16_BE            \
 624    | CATEGORY_MASK_UTF_16_LE            \
 625    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 626    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 627    | CATEGORY_MASK_CHARSET              \
 628    | CATEGORY_MASK_SJIS                 \
 629    | CATEGORY_MASK_BIG5                 \
 630    | CATEGORY_MASK_CCL                  \
 631    | CATEGORY_MASK_EMACS_MULE)
 632
 633
 634 #define CATEGORY_MASK_ISO_7BIT \
 635   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 636
 637 #define CATEGORY_MASK_ISO_8BIT \
 638   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 639
 640 #define CATEGORY_MASK_ISO_ELSE \
 641   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 642
 643 #define CATEGORY_MASK_ISO_ESCAPE        \
 644   (CATEGORY_MASK_ISO_7                  \
 645    | CATEGORY_MASK_ISO_7_TIGHT          \
 646    | CATEGORY_MASK_ISO_7_ELSE           \
 647    | CATEGORY_MASK_ISO_8_ELSE)
 648
 649 #define CATEGORY_MASK_ISO       \
 650   (  CATEGORY_MASK_ISO_7BIT     \
 651      | CATEGORY_MASK_ISO_8BIT   \
 652      | CATEGORY_MASK_ISO_ELSE)
 653
 654 #define CATEGORY_MASK_UTF_16            \
 655   (CATEGORY_MASK_UTF_16_BE              \
 656    | CATEGORY_MASK_UTF_16_LE            \
 657    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 658    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 659
 660
 661 /* List of symbols `coding-category-xxx' ordered by priority.  This
 662    variable is exposed to Emacs Lisp.  */
 663 static Lisp_Object Vcoding_category_list;
 664
 665 /* Table of coding categories (Lisp symbols).  This variable is for
 666    internal use oly.  */
 667 static Lisp_Object Vcoding_category_table;
 668
 669 /* Table of coding-categories ordered by priority.  */
 670 static enum coding_category coding_priorities[coding_category_max];
 671
 672 /* Nth element is a coding context for the coding system bound to the
 673    Nth coding category.  */
 674 static struct coding_system coding_categories[coding_category_max];
 675
 676 /*** Commonly used macros and functions ***/
 677
 678 #ifndef min
 679 #define min(a, b) ((a) < (b) ? (a) : (b))
 680 #endif
 681 #ifndef max
 682 #define max(a, b) ((a) > (b) ? (a) : (b))
 683 #endif
 684
 685 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 686   do {                                                  \
 687     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 688     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 689   } while (0)
 690
 691
 692 /* Safely get one byte from the source text pointed by SRC which ends
 693    at SRC_END, and set C to that byte.  If there are not enough bytes
 694    in the source, it jumps to `no_more_source'.  If multibytep is
 695    nonzero, and a multibyte character is found at SRC, set C to the
 696    negative value of the character code.  The caller should declare
 697    and set these variables appropriately in advance:
 698         src, src_end, multibytep */
 699
 700 #define ONE_MORE_BYTE(c)                                \
 701   do {                                                  \
 702     if (src == src_end)                                 \
 703       {                                                 \
 704         if (src_base < src)                             \
 705           record_conversion_result                      \
 706             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 707         goto no_more_source;                            \
 708       }                                                 \
 709     c = *src++;                                         \
 710     if (multibytep && (c & 0x80))                       \
 711       {                                                 \
 712         if ((c & 0xFE) == 0xC0)                         \
 713           c = ((c & 1) << 6) | *src++;                  \
 714         else                                            \
 715           {                                             \
 716             src--;                                      \
 717             c = - string_char (src, &src, NULL);        \
 718             record_conversion_result                    \
 719               (coding, CODING_RESULT_INVALID_SRC);      \
 720           }                                             \
 721       }                                                 \
 722     consumed_chars++;                                   \
 723   } while (0)
 724
 725
 726 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 727   do {                                                  \
 728     c = *src++;                                         \
 729     if (multibytep && (c & 0x80))                       \
 730       {                                                 \
 731         if ((c & 0xFE) == 0xC0)                         \
 732           c = ((c & 1) << 6) | *src++;                  \
 733         else                                            \
 734           {                                             \
 735             src--;                                      \
 736             c = - string_char (src, &src, NULL);        \
 737             record_conversion_result                    \
 738               (coding, CODING_RESULT_INVALID_SRC);      \
 739           }                                             \
 740       }                                                 \
 741     consumed_chars++;                                   \
 742   } while (0)
 743
 744
 745 /* Store a byte C in the place pointed by DST and increment DST to the
 746    next free point, and increment PRODUCED_CHARS.  The caller should
 747    assure that C is 0..127, and declare and set the variable `dst'
 748    appropriately in advance.
 749 */
 750
 751
 752 #define EMIT_ONE_ASCII_BYTE(c)  \
 753   do {                          \
 754     produced_chars++;           \
 755     *dst++ = (c);               \
 756   } while (0)
 757
 758
 759 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 760
 761 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 762   do {                                  \
 763     produced_chars += 2;                \
 764     *dst++ = (c1), *dst++ = (c2);       \
 765   } while (0)
 766
 767
 768 /* Store a byte C in the place pointed by DST and increment DST to the
 769    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 770    nonzero, store in an appropriate multibyte from.  The caller should
 771    declare and set the variables `dst' and `multibytep' appropriately
 772    in advance.  */
 773
 774 #define EMIT_ONE_BYTE(c)                \
 775   do {                                  \
 776     produced_chars++;                   \
 777     if (multibytep)                     \
 778       {                                 \
 779         int ch = (c);                   \
 780         if (ch >= 0x80)                 \
 781           ch = BYTE8_TO_CHAR (ch);      \
 782         CHAR_STRING_ADVANCE (ch, dst);  \
 783       }                                 \
 784     else                                \
 785       *dst++ = (c);                     \
 786   } while (0)
 787
 788
 789 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 790
 791 #define EMIT_TWO_BYTES(c1, c2)          \
 792   do {                                  \
 793     produced_chars += 2;                \
 794     if (multibytep)                     \
 795       {                                 \
 796         int ch;                         \
 797                                         \
 798         ch = (c1);                      \
 799         if (ch >= 0x80)                 \
 800           ch = BYTE8_TO_CHAR (ch);      \
 801         CHAR_STRING_ADVANCE (ch, dst);  \
 802         ch = (c2);                      \
 803         if (ch >= 0x80)                 \
 804           ch = BYTE8_TO_CHAR (ch);      \
 805         CHAR_STRING_ADVANCE (ch, dst);  \
 806       }                                 \
 807     else                                \
 808       {                                 \
 809         *dst++ = (c1);                  \
 810         *dst++ = (c2);                  \
 811       }                                 \
 812   } while (0)
 813
 814
 815 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 816   do {                                  \
 817     EMIT_ONE_BYTE (c1);                 \
 818     EMIT_TWO_BYTES (c2, c3);            \
 819   } while (0)
 820
 821
 822 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 823   do {                                          \
 824     EMIT_TWO_BYTES (c1, c2);                    \
 825     EMIT_TWO_BYTES (c3, c4);                    \
 826   } while (0)
 827
 828
 829 /* Prototypes for static functions.  */
 830 static void record_conversion_result P_ ((struct coding_system *coding,
 831                                           enum coding_result_code result));
 832 static int detect_coding_utf_8 P_ ((struct coding_system *,
 833                                     struct coding_detection_info *info));
 834 static void decode_coding_utf_8 P_ ((struct coding_system *));
 835 static int encode_coding_utf_8 P_ ((struct coding_system *));
 836
 837 static int detect_coding_utf_16 P_ ((struct coding_system *,
 838                                      struct coding_detection_info *info));
 839 static void decode_coding_utf_16 P_ ((struct coding_system *));
 840 static int encode_coding_utf_16 P_ ((struct coding_system *));
 841
 842 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 843                                        struct coding_detection_info *info));
 844 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 845 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 846
 847 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 848                                          struct coding_detection_info *info));
 849 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 850 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 851
 852 static int detect_coding_sjis P_ ((struct coding_system *,
 853                                    struct coding_detection_info *info));
 854 static void decode_coding_sjis P_ ((struct coding_system *));
 855 static int encode_coding_sjis P_ ((struct coding_system *));
 856
 857 static int detect_coding_big5 P_ ((struct coding_system *,
 858                                    struct coding_detection_info *info));
 859 static void decode_coding_big5 P_ ((struct coding_system *));
 860 static int encode_coding_big5 P_ ((struct coding_system *));
 861
 862 static int detect_coding_ccl P_ ((struct coding_system *,
 863                                   struct coding_detection_info *info));
 864 static void decode_coding_ccl P_ ((struct coding_system *));
 865 static int encode_coding_ccl P_ ((struct coding_system *));
 866
 867 static void decode_coding_raw_text P_ ((struct coding_system *));
 868 static int encode_coding_raw_text P_ ((struct coding_system *));
 869
 870 static void coding_set_source P_ ((struct coding_system *));
 871 static void coding_set_destination P_ ((struct coding_system *));
 872 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
 873 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
 874                                             EMACS_INT));
 875 static unsigned char *alloc_destination P_ ((struct coding_system *,
 876                                              EMACS_INT, unsigned char *));
 877 static void setup_iso_safe_charsets P_ ((Lisp_Object));
 878 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
 879                                                      int *, int *,
 880                                                      unsigned char *));
 881 static int detect_eol P_ ((const unsigned char *,
 882                            EMACS_INT, enum coding_category));
 883 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
 884 static void decode_eol P_ ((struct coding_system *));
 885 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
 886 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
 887                                         int, int *, int *));
 888 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
 889 static INLINE void produce_composition P_ ((struct coding_system *, int *,
 890                                             EMACS_INT));
 891 static INLINE void produce_charset P_ ((struct coding_system *, int *,
 892                                         EMACS_INT));
 893 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
 894 static int decode_coding P_ ((struct coding_system *));
 895 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
 896                                                       struct coding_system *,
 897                                                       int *, EMACS_INT *));
 898 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
 899                                                   struct coding_system *,
 900                                                   int *, EMACS_INT *));
 901 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
 902 static int encode_coding P_ ((struct coding_system *));
 903 static Lisp_Object make_conversion_work_buffer P_ ((int));
 904 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
 905 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
 906 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 907
 908 static void
 909 record_conversion_result (struct coding_system *coding,
 910                           enum coding_result_code result)
 911 {
 912   coding->result = result;
 913   switch (result)
 914     {
 915     case CODING_RESULT_INSUFFICIENT_SRC:
 916       Vlast_code_conversion_error = Qinsufficient_source;
 917       break;
 918     case CODING_RESULT_INCONSISTENT_EOL:
 919       Vlast_code_conversion_error = Qinconsistent_eol;
 920       break;
 921     case CODING_RESULT_INVALID_SRC:
 922       Vlast_code_conversion_error = Qinvalid_source;
 923       break;
 924     case CODING_RESULT_INTERRUPT:
 925       Vlast_code_conversion_error = Qinterrupted;
 926       break;
 927     case CODING_RESULT_INSUFFICIENT_MEM:
 928       Vlast_code_conversion_error = Qinsufficient_memory;
 929       break;
 930     default:
 931       Vlast_code_conversion_error = intern ("Unknown error");
 932     }
 933 }
 934
 935 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 936   do {                                                                       \
 937     charset_map_loaded = 0;                                                  \
 938     c = DECODE_CHAR (charset, code);                                         \
 939     if (charset_map_loaded)                                                  \
 940       {                                                                      \
 941         const unsigned char *orig = coding->source;                          \
 942         EMACS_INT offset;                                                    \
 943                                                                              \
 944         coding_set_source (coding);                                          \
 945         offset = coding->source - orig;                                      \
 946         src += offset;                                                       \
 947         src_base += offset;                                                  \
 948         src_end += offset;                                                   \
 949       }                                                                      \
 950   } while (0)
 951
 952
 953 #define ASSURE_DESTINATION(bytes)                               \
 954   do {                                                          \
 955     if (dst + (bytes) >= dst_end)                               \
 956       {                                                         \
 957         int more_bytes = charbuf_end - charbuf + (bytes);       \
 958                                                                 \
 959         dst = alloc_destination (coding, more_bytes, dst);      \
 960         dst_end = coding->destination + coding->dst_bytes;      \
 961       }                                                         \
 962   } while (0)
 963
 964
 965
 966 static void
 967 coding_set_source (coding)
 968      struct coding_system *coding;
 969 {
 970   if (BUFFERP (coding->src_object))
 971     {
 972       struct buffer *buf = XBUFFER (coding->src_object);
 973
 974       if (coding->src_pos < 0)
 975         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 976       else
 977         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 978     }
 979   else if (STRINGP (coding->src_object))
 980     {
 981       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 982     }
 983   else
 984     /* Otherwise, the source is C string and is never relocated
 985        automatically.  Thus we don't have to update anything.  */
 986     ;
 987 }
 988
 989 static void
 990 coding_set_destination (coding)
 991      struct coding_system *coding;
 992 {
 993   if (BUFFERP (coding->dst_object))
 994     {
 995       if (coding->src_pos < 0)
 996         {
 997           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
 998           coding->dst_bytes = (GAP_END_ADDR
 999                                - (coding->src_bytes - coding->consumed)
1000                                - coding->destination);
1001         }
1002       else
1003         {
1004           /* We are sure that coding->dst_pos_byte is before the gap
1005              of the buffer. */
1006           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1007                                  + coding->dst_pos_byte - 1);
1008           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1009                                - coding->destination);
1010         }
1011     }
1012   else
1013     /* Otherwise, the destination is C string and is never relocated
1014        automatically.  Thus we don't have to update anything.  */
1015     ;
1016 }
1017
1018
1019 static void
1020 coding_alloc_by_realloc (coding, bytes)
1021      struct coding_system *coding;
1022      EMACS_INT bytes;
1023 {
1024   coding->destination = (unsigned char *) xrealloc (coding->destination,
1025                                                     coding->dst_bytes + bytes);
1026   coding->dst_bytes += bytes;
1027 }
1028
1029 static void
1030 coding_alloc_by_making_gap (coding, bytes)
1031      struct coding_system *coding;
1032      EMACS_INT bytes;
1033 {
1034   if (BUFFERP (coding->dst_object)
1035       && EQ (coding->src_object, coding->dst_object))
1036     {
1037       EMACS_INT add = coding->src_bytes - coding->consumed;
1038
1039       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1040       make_gap (bytes);
1041       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1042     }
1043   else
1044     {
1045       Lisp_Object this_buffer;
1046
1047       this_buffer = Fcurrent_buffer ();
1048       set_buffer_internal (XBUFFER (coding->dst_object));
1049       make_gap (bytes);
1050       set_buffer_internal (XBUFFER (this_buffer));
1051     }
1052 }
1053
1054
1055 static unsigned char *
1056 alloc_destination (coding, nbytes, dst)
1057      struct coding_system *coding;
1058      EMACS_INT nbytes;
1059      unsigned char *dst;
1060 {
1061   EMACS_INT offset = dst - coding->destination;
1062
1063   if (BUFFERP (coding->dst_object))
1064     coding_alloc_by_making_gap (coding, nbytes);
1065   else
1066     coding_alloc_by_realloc (coding, nbytes);
1067   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1068   coding_set_destination (coding);
1069   dst = coding->destination + offset;
1070   return dst;
1071 }
1072
1073 /** Macros for annotations.  */
1074
1075 /* Maximum length of annotation data (sum of annotations for
1076    composition and charset).  */
1077 #define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1078
1079 /* An annotation data is stored in the array coding->charbuf in this
1080    format:
1081      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1082    LENGTH is the number of elements in the annotation.
1083    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1084    NCHARS is the number of characters in the text annotated.
1085
1086    The format of the following elements depend on ANNOTATION_MASK.
1087
1088    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1089    follows:
1090      ... METHOD [ COMPOSITION-COMPONENTS ... ]
1091    METHOD is one of enum composition_method.
1092    Optionnal COMPOSITION-COMPONENTS are characters and composition
1093    rules.
1094
1095    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1096    follows.  */
1097
1098 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1099   do {                                                  \
1100     *(buf)++ = -(len);                                  \
1101     *(buf)++ = (mask);                                  \
1102     *(buf)++ = (nchars);                                \
1103     coding->annotated = 1;                              \
1104   } while (0);
1105
1106 #define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
1107   do {                                                                      \
1108     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1109     *buf++ = method;                                                        \
1110   } while (0)
1111
1112
1113 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1114   do {                                                                  \
1115     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1116     *buf++ = id;                                                        \
1117   } while (0)
1118
1119 \f
1120 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1121
1122
1123
1124 \f
1125 /*** 3. UTF-8 ***/
1126
1127 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1128    Check if a text is encoded in UTF-8.  If it is, return 1, else
1129    return 0.  */
1130
1131 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1132 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1133 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1134 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1135 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1136 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1137
1138 static int
1139 detect_coding_utf_8 (coding, detect_info)
1140      struct coding_system *coding;
1141      struct coding_detection_info *detect_info;
1142 {
1143   const unsigned char *src = coding->source, *src_base;
1144   const unsigned char *src_end = coding->source + coding->src_bytes;
1145   int multibytep = coding->src_multibyte;
1146   int consumed_chars = 0;
1147   int found = 0;
1148
1149   detect_info->checked |= CATEGORY_MASK_UTF_8;
1150   /* A coding system of this category is always ASCII compatible.  */
1151   src += coding->head_ascii;
1152
1153   while (1)
1154     {
1155       int c, c1, c2, c3, c4;
1156
1157       src_base = src;
1158       ONE_MORE_BYTE (c);
1159       if (c < 0 || UTF_8_1_OCTET_P (c))
1160         continue;
1161       ONE_MORE_BYTE (c1);
1162       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1163         break;
1164       if (UTF_8_2_OCTET_LEADING_P (c))
1165         {
1166           found = CATEGORY_MASK_UTF_8;
1167           continue;
1168         }
1169       ONE_MORE_BYTE (c2);
1170       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1171         break;
1172       if (UTF_8_3_OCTET_LEADING_P (c))
1173         {
1174           found = CATEGORY_MASK_UTF_8;
1175           continue;
1176         }
1177       ONE_MORE_BYTE (c3);
1178       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1179         break;
1180       if (UTF_8_4_OCTET_LEADING_P (c))
1181         {
1182           found = CATEGORY_MASK_UTF_8;
1183           continue;
1184         }
1185       ONE_MORE_BYTE (c4);
1186       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1187         break;
1188       if (UTF_8_5_OCTET_LEADING_P (c))
1189         {
1190           found = CATEGORY_MASK_UTF_8;
1191           continue;
1192         }
1193       break;
1194     }
1195   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1196   return 0;
1197
1198  no_more_source:
1199   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1200     {
1201       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1202       return 0;
1203     }
1204   detect_info->found |= found;
1205   return 1;
1206 }
1207
1208
1209 static void
1210 decode_coding_utf_8 (coding)
1211      struct coding_system *coding;
1212 {
1213   const unsigned char *src = coding->source + coding->consumed;
1214   const unsigned char *src_end = coding->source + coding->src_bytes;
1215   const unsigned char *src_base;
1216   int *charbuf = coding->charbuf + coding->charbuf_used;
1217   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1218   int consumed_chars = 0, consumed_chars_base;
1219   int multibytep = coding->src_multibyte;
1220   Lisp_Object attr, charset_list;
1221
1222   CODING_GET_INFO (coding, attr, charset_list);
1223
1224   while (1)
1225     {
1226       int c, c1, c2, c3, c4, c5;
1227
1228       src_base = src;
1229       consumed_chars_base = consumed_chars;
1230
1231       if (charbuf >= charbuf_end)
1232         break;
1233
1234       ONE_MORE_BYTE (c1);
1235       if (c1 < 0)
1236         {
1237           c = - c1;
1238         }
1239       else if (UTF_8_1_OCTET_P(c1))
1240         {
1241           c = c1;
1242         }
1243       else
1244         {
1245           ONE_MORE_BYTE (c2);
1246           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1247             goto invalid_code;
1248           if (UTF_8_2_OCTET_LEADING_P (c1))
1249             {
1250               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1251               /* Reject overlong sequences here and below.  Encoders
1252                  producing them are incorrect, they can be misleading,
1253                  and they mess up read/write invariance.  */
1254               if (c < 128)
1255                 goto invalid_code;
1256             }
1257           else
1258             {
1259               ONE_MORE_BYTE (c3);
1260               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1261                 goto invalid_code;
1262               if (UTF_8_3_OCTET_LEADING_P (c1))
1263                 {
1264                   c = (((c1 & 0xF) << 12)
1265                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1266                   if (c < 0x800
1267                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1268                     goto invalid_code;
1269                 }
1270               else
1271                 {
1272                   ONE_MORE_BYTE (c4);
1273                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1274                     goto invalid_code;
1275                   if (UTF_8_4_OCTET_LEADING_P (c1))
1276                     {
1277                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1278                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1279                     if (c < 0x10000)
1280                       goto invalid_code;
1281                     }
1282                   else
1283                     {
1284                       ONE_MORE_BYTE (c5);
1285                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1286                         goto invalid_code;
1287                       if (UTF_8_5_OCTET_LEADING_P (c1))
1288                         {
1289                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1290                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1291                                | (c5 & 0x3F));
1292                           if ((c > MAX_CHAR) || (c < 0x200000))
1293                             goto invalid_code;
1294                         }
1295                       else
1296                         goto invalid_code;
1297                     }
1298                 }
1299             }
1300         }
1301
1302       *charbuf++ = c;
1303       continue;
1304
1305     invalid_code:
1306       src = src_base;
1307       consumed_chars = consumed_chars_base;
1308       ONE_MORE_BYTE (c);
1309       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1310       coding->errors++;
1311     }
1312
1313  no_more_source:
1314   coding->consumed_char += consumed_chars_base;
1315   coding->consumed = src_base - coding->source;
1316   coding->charbuf_used = charbuf - coding->charbuf;
1317 }
1318
1319
1320 static int
1321 encode_coding_utf_8 (coding)
1322      struct coding_system *coding;
1323 {
1324   int multibytep = coding->dst_multibyte;
1325   int *charbuf = coding->charbuf;
1326   int *charbuf_end = charbuf + coding->charbuf_used;
1327   unsigned char *dst = coding->destination + coding->produced;
1328   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1329   int produced_chars = 0;
1330   int c;
1331
1332   if (multibytep)
1333     {
1334       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1335
1336       while (charbuf < charbuf_end)
1337         {
1338           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1339
1340           ASSURE_DESTINATION (safe_room);
1341           c = *charbuf++;
1342           if (CHAR_BYTE8_P (c))
1343             {
1344               c = CHAR_TO_BYTE8 (c);
1345               EMIT_ONE_BYTE (c);
1346             }
1347           else
1348             {
1349               CHAR_STRING_ADVANCE (c, pend);
1350               for (p = str; p < pend; p++)
1351                 EMIT_ONE_BYTE (*p);
1352             }
1353         }
1354     }
1355   else
1356     {
1357       int safe_room = MAX_MULTIBYTE_LENGTH;
1358
1359       while (charbuf < charbuf_end)
1360         {
1361           ASSURE_DESTINATION (safe_room);
1362           c = *charbuf++;
1363           if (CHAR_BYTE8_P (c))
1364             *dst++ = CHAR_TO_BYTE8 (c);
1365           else
1366             dst += CHAR_STRING (c, dst);
1367           produced_chars++;
1368         }
1369     }
1370   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1371   coding->produced_char += produced_chars;
1372   coding->produced = dst - coding->destination;
1373   return 0;
1374 }
1375
1376
1377 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1378    Check if a text is encoded in one of UTF-16 based coding systems.
1379    If it is, return 1, else return 0.  */
1380
1381 #define UTF_16_HIGH_SURROGATE_P(val) \
1382   (((val) & 0xFC00) == 0xD800)
1383
1384 #define UTF_16_LOW_SURROGATE_P(val) \
1385   (((val) & 0xFC00) == 0xDC00)
1386
1387 #define UTF_16_INVALID_P(val)   \
1388   (((val) == 0xFFFE)            \
1389    || ((val) == 0xFFFF)         \
1390    || UTF_16_LOW_SURROGATE_P (val))
1391
1392
1393 static int
1394 detect_coding_utf_16 (coding, detect_info)
1395      struct coding_system *coding;
1396      struct coding_detection_info *detect_info;
1397 {
1398   const unsigned char *src = coding->source, *src_base = src;
1399   const unsigned char *src_end = coding->source + coding->src_bytes;
1400   int multibytep = coding->src_multibyte;
1401   int consumed_chars = 0;
1402   int c1, c2;
1403
1404   detect_info->checked |= CATEGORY_MASK_UTF_16;
1405   if (coding->mode & CODING_MODE_LAST_BLOCK
1406       && (coding->src_chars & 1))
1407     {
1408       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1409       return 0;
1410     }
1411
1412   ONE_MORE_BYTE (c1);
1413   ONE_MORE_BYTE (c2);
1414   if ((c1 == 0xFF) && (c2 == 0xFE))
1415     {
1416       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1417                              | CATEGORY_MASK_UTF_16_AUTO);
1418       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1419                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1420                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1421     }
1422   else if ((c1 == 0xFE) && (c2 == 0xFF))
1423     {
1424       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1425                              | CATEGORY_MASK_UTF_16_AUTO);
1426       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1427                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1428                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1429     }
1430   else if (c1 >= 0 && c2 >= 0)
1431     {
1432       detect_info->rejected
1433         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1434     }
1435  no_more_source:
1436   return 1;
1437 }
1438
1439 static void
1440 decode_coding_utf_16 (coding)
1441      struct coding_system *coding;
1442 {
1443   const unsigned char *src = coding->source + coding->consumed;
1444   const unsigned char *src_end = coding->source + coding->src_bytes;
1445   const unsigned char *src_base;
1446   int *charbuf = coding->charbuf + coding->charbuf_used;
1447   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1448   int consumed_chars = 0, consumed_chars_base;
1449   int multibytep = coding->src_multibyte;
1450   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1451   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1452   int surrogate = CODING_UTF_16_SURROGATE (coding);
1453   Lisp_Object attr, charset_list;
1454
1455   CODING_GET_INFO (coding, attr, charset_list);
1456
1457   if (bom == utf_16_with_bom)
1458     {
1459       int c, c1, c2;
1460
1461       src_base = src;
1462       ONE_MORE_BYTE (c1);
1463       ONE_MORE_BYTE (c2);
1464       c = (c1 << 8) | c2;
1465
1466       if (endian == utf_16_big_endian
1467           ? c != 0xFEFF : c != 0xFFFE)
1468         {
1469           /* The first two bytes are not BOM.  Treat them as bytes
1470              for a normal character.  */
1471           src = src_base;
1472           coding->errors++;
1473         }
1474       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1475     }
1476   else if (bom == utf_16_detect_bom)
1477     {
1478       /* We have already tried to detect BOM and failed in
1479          detect_coding.  */
1480       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1481     }
1482
1483   while (1)
1484     {
1485       int c, c1, c2;
1486
1487       src_base = src;
1488       consumed_chars_base = consumed_chars;
1489
1490       if (charbuf + 2 >= charbuf_end)
1491         break;
1492
1493       ONE_MORE_BYTE (c1);
1494       if (c1 < 0)
1495         {
1496           *charbuf++ = -c1;
1497           continue;
1498         }
1499       ONE_MORE_BYTE (c2);
1500       if (c2 < 0)
1501         {
1502           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1503           *charbuf++ = -c2;
1504           continue;
1505         }
1506       c = (endian == utf_16_big_endian
1507            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1508       if (surrogate)
1509         {
1510           if (! UTF_16_LOW_SURROGATE_P (c))
1511             {
1512               if (endian == utf_16_big_endian)
1513                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1514               else
1515                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1516               *charbuf++ = c1;
1517               *charbuf++ = c2;
1518               coding->errors++;
1519               if (UTF_16_HIGH_SURROGATE_P (c))
1520                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1521               else
1522                 *charbuf++ = c;
1523             }
1524           else
1525             {
1526               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1527               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1528               *charbuf++ = 0x10000 + c;
1529             }
1530         }
1531       else
1532         {
1533           if (UTF_16_HIGH_SURROGATE_P (c))
1534             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1535           else
1536             *charbuf++ = c;
1537         }
1538     }
1539
1540  no_more_source:
1541   coding->consumed_char += consumed_chars_base;
1542   coding->consumed = src_base - coding->source;
1543   coding->charbuf_used = charbuf - coding->charbuf;
1544 }
1545
1546 static int
1547 encode_coding_utf_16 (coding)
1548      struct coding_system *coding;
1549 {
1550   int multibytep = coding->dst_multibyte;
1551   int *charbuf = coding->charbuf;
1552   int *charbuf_end = charbuf + coding->charbuf_used;
1553   unsigned char *dst = coding->destination + coding->produced;
1554   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1555   int safe_room = 8;
1556   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1557   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1558   int produced_chars = 0;
1559   Lisp_Object attrs, charset_list;
1560   int c;
1561
1562   CODING_GET_INFO (coding, attrs, charset_list);
1563
1564   if (bom != utf_16_without_bom)
1565     {
1566       ASSURE_DESTINATION (safe_room);
1567       if (big_endian)
1568         EMIT_TWO_BYTES (0xFE, 0xFF);
1569       else
1570         EMIT_TWO_BYTES (0xFF, 0xFE);
1571       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1572     }
1573
1574   while (charbuf < charbuf_end)
1575     {
1576       ASSURE_DESTINATION (safe_room);
1577       c = *charbuf++;
1578       if (c >= MAX_UNICODE_CHAR)
1579         c = coding->default_char;
1580
1581       if (c < 0x10000)
1582         {
1583           if (big_endian)
1584             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1585           else
1586             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1587         }
1588       else
1589         {
1590           int c1, c2;
1591
1592           c -= 0x10000;
1593           c1 = (c >> 10) + 0xD800;
1594           c2 = (c & 0x3FF) + 0xDC00;
1595           if (big_endian)
1596             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1597           else
1598             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1599         }
1600     }
1601   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1602   coding->produced = dst - coding->destination;
1603   coding->produced_char += produced_chars;
1604   return 0;
1605 }
1606
1607 \f
1608 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1609
1610 /* Emacs' internal format for representation of multiple character
1611    sets is a kind of multi-byte encoding, i.e. characters are
1612    represented by variable-length sequences of one-byte codes.
1613
1614    ASCII characters and control characters (e.g. `tab', `newline') are
1615    represented by one-byte sequences which are their ASCII codes, in
1616    the range 0x00 through 0x7F.
1617
1618    8-bit characters of the range 0x80..0x9F are represented by
1619    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1620    code + 0x20).
1621
1622    8-bit characters of the range 0xA0..0xFF are represented by
1623    one-byte sequences which are their 8-bit code.
1624
1625    The other characters are represented by a sequence of `base
1626    leading-code', optional `extended leading-code', and one or two
1627    `position-code's.  The length of the sequence is determined by the
1628    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1629    whereas extended leading-code and position-code take the range 0xA0
1630    through 0xFF.  See `charset.h' for more details about leading-code
1631    and position-code.
1632
1633    --- CODE RANGE of Emacs' internal format ---
1634    character set        range
1635    -------------        -----
1636    ascii                0x00..0x7F
1637    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1638    eight-bit-graphic    0xA0..0xBF
1639    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1640    ---------------------------------------------
1641
1642    As this is the internal character representation, the format is
1643    usually not used externally (i.e. in a file or in a data sent to a
1644    process).  But, it is possible to have a text externally in this
1645    format (i.e. by encoding by the coding system `emacs-mule').
1646
1647    In that case, a sequence of one-byte codes has a slightly different
1648    form.
1649
1650    At first, all characters in eight-bit-control are represented by
1651    one-byte sequences which are their 8-bit code.
1652
1653    Next, character composition data are represented by the byte
1654    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1655    where,
1656         METHOD is 0xF0 plus one of composition method (enum
1657         composition_method),
1658
1659         BYTES is 0xA0 plus a byte length of this composition data,
1660
1661         CHARS is 0x20 plus a number of characters composed by this
1662         data,
1663
1664         COMPONENTs are characters of multibye form or composition
1665         rules encoded by two-byte of ASCII codes.
1666
1667    In addition, for backward compatibility, the following formats are
1668    also recognized as composition data on decoding.
1669
1670    0x80 MSEQ ...
1671    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1672
1673    Here,
1674         MSEQ is a multibyte form but in these special format:
1675           ASCII: 0xA0 ASCII_CODE+0x80,
1676           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1677         RULE is a one byte code of the range 0xA0..0xF0 that
1678         represents a composition rule.
1679   */
1680
1681 char emacs_mule_bytes[256];
1682
1683 int
1684 emacs_mule_char (coding, src, nbytes, nchars, id)
1685      struct coding_system *coding;
1686      const unsigned char *src;
1687      int *nbytes, *nchars, *id;
1688 {
1689   const unsigned char *src_end = coding->source + coding->src_bytes;
1690   const unsigned char *src_base = src;
1691   int multibytep = coding->src_multibyte;
1692   struct charset *charset;
1693   unsigned code;
1694   int c;
1695   int consumed_chars = 0;
1696
1697   ONE_MORE_BYTE (c);
1698   if (c < 0)
1699     {
1700       c = -c;
1701       charset = emacs_mule_charset[0];
1702     }
1703   else
1704     {
1705       if (c >= 0xA0)
1706         {
1707           /* Old style component character of a compostion.  */
1708           if (c == 0xA0)
1709             {
1710               ONE_MORE_BYTE (c);
1711               c -= 0x80;
1712             }
1713           else
1714             c -= 0x20;
1715         }
1716
1717       switch (emacs_mule_bytes[c])
1718         {
1719         case 2:
1720           if (! (charset = emacs_mule_charset[c]))
1721             goto invalid_code;
1722           ONE_MORE_BYTE (c);
1723           if (c < 0xA0)
1724             goto invalid_code;
1725           code = c & 0x7F;
1726           break;
1727
1728         case 3:
1729           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1730               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1731             {
1732               ONE_MORE_BYTE (c);
1733               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1734                 goto invalid_code;
1735               ONE_MORE_BYTE (c);
1736               if (c < 0xA0)
1737                 goto invalid_code;
1738               code = c & 0x7F;
1739             }
1740           else
1741             {
1742               if (! (charset = emacs_mule_charset[c]))
1743                 goto invalid_code;
1744               ONE_MORE_BYTE (c);
1745               if (c < 0xA0)
1746                 goto invalid_code;
1747               code = (c & 0x7F) << 8;
1748               ONE_MORE_BYTE (c);
1749               if (c < 0xA0)
1750                 goto invalid_code;
1751               code |= c & 0x7F;
1752             }
1753           break;
1754
1755         case 4:
1756           ONE_MORE_BYTE (c);
1757           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1758             goto invalid_code;
1759           ONE_MORE_BYTE (c);
1760           if (c < 0xA0)
1761             goto invalid_code;
1762           code = (c & 0x7F) << 8;
1763           ONE_MORE_BYTE (c);
1764           if (c < 0xA0)
1765             goto invalid_code;
1766           code |= c & 0x7F;
1767           break;
1768
1769         case 1:
1770           code = c;
1771           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1772                                      ? charset_ascii : charset_eight_bit);
1773           break;
1774
1775         default:
1776           abort ();
1777         }
1778       c = DECODE_CHAR (charset, code);
1779       if (c < 0)
1780         goto invalid_code;
1781     }
1782   *nbytes = src - src_base;
1783   *nchars = consumed_chars;
1784   if (id)
1785     *id = charset->id;
1786   return c;
1787
1788  no_more_source:
1789   return -2;
1790
1791  invalid_code:
1792   return -1;
1793 }
1794
1795
1796 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1797    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1798    else return 0.  */
1799
1800 static int
1801 detect_coding_emacs_mule (coding, detect_info)
1802      struct coding_system *coding;
1803      struct coding_detection_info *detect_info;
1804 {
1805   const unsigned char *src = coding->source, *src_base;
1806   const unsigned char *src_end = coding->source + coding->src_bytes;
1807   int multibytep = coding->src_multibyte;
1808   int consumed_chars = 0;
1809   int c;
1810   int found = 0;
1811
1812   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1813   /* A coding system of this category is always ASCII compatible.  */
1814   src += coding->head_ascii;
1815
1816   while (1)
1817     {
1818       src_base = src;
1819       ONE_MORE_BYTE (c);
1820       if (c < 0)
1821         continue;
1822       if (c == 0x80)
1823         {
1824           /* Perhaps the start of composite character.  We simple skip
1825              it because analyzing it is too heavy for detecting.  But,
1826              at least, we check that the composite character
1827              constitues of more than 4 bytes.  */
1828           const unsigned char *src_base;
1829
1830         repeat:
1831           src_base = src;
1832           do
1833             {
1834               ONE_MORE_BYTE (c);
1835             }
1836           while (c >= 0xA0);
1837
1838           if (src - src_base <= 4)
1839             break;
1840           found = CATEGORY_MASK_EMACS_MULE;
1841           if (c == 0x80)
1842             goto repeat;
1843         }
1844
1845       if (c < 0x80)
1846         {
1847           if (c < 0x20
1848               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1849             break;
1850         }
1851       else
1852         {
1853           int more_bytes = emacs_mule_bytes[*src_base] - 1;
1854
1855           while (more_bytes > 0)
1856             {
1857               ONE_MORE_BYTE (c);
1858               if (c < 0xA0)
1859                 {
1860                   src--;        /* Unread the last byte.  */
1861                   break;
1862                 }
1863               more_bytes--;
1864             }
1865           if (more_bytes != 0)
1866             break;
1867           found = CATEGORY_MASK_EMACS_MULE;
1868         }
1869     }
1870   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1871   return 0;
1872
1873  no_more_source:
1874   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1875     {
1876       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1877       return 0;
1878     }
1879   detect_info->found |= found;
1880   return 1;
1881 }
1882
1883
1884 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1885
1886 /* Decode a character represented as a component of composition
1887    sequence of Emacs 20/21 style at SRC.  Set C to that character and
1888    update SRC to the head of next character (or an encoded composition
1889    rule).  If SRC doesn't points a composition component, set C to -1.
1890    If SRC points an invalid byte sequence, global exit by a return
1891    value 0.  */
1892
1893 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1894   if (1)                                                        \
1895     {                                                           \
1896       int c;                                                    \
1897       int nbytes, nchars;                                       \
1898                                                                 \
1899       if (src == src_end)                                       \
1900         break;                                                  \
1901       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1902       if (c < 0)                                                \
1903         {                                                       \
1904           if (c == -2)                                          \
1905             break;                                              \
1906           goto invalid_code;                                    \
1907         }                                                       \
1908       *buf++ = c;                                               \
1909       src += nbytes;                                            \
1910       consumed_chars += nchars;                                 \
1911     }                                                           \
1912   else
1913
1914
1915 /* Decode a composition rule represented as a component of composition
1916    sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
1917    and increment BUF.  If SRC points an invalid byte sequence, set C
1918    to -1.  */
1919
1920 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1921   do {                                                  \
1922     int c, gref, nref;                                  \
1923                                                         \
1924     if (src >= src_end)                                 \
1925       goto invalid_code;                                \
1926     ONE_MORE_BYTE_NO_CHECK (c);                         \
1927     c -= 0xA0;                                          \
1928     if (c < 0 || c >= 81)                               \
1929       goto invalid_code;                                \
1930                                                         \
1931     gref = c / 9, nref = c % 9;                         \
1932     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1933   } while (0)
1934
1935
1936 /* Decode a composition rule represented as a component of composition
1937    sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
1938    and increment BUF.  If SRC points an invalid byte sequence, set C
1939    to -1.  */
1940
1941 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1942   do {                                                  \
1943     int gref, nref;                                     \
1944                                                         \
1945     if (src + 1>= src_end)                              \
1946       goto invalid_code;                                \
1947     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1948     gref -= 0x20;                                       \
1949     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1950     nref -= 0x20;                                       \
1951     if (gref < 0 || gref >= 81                          \
1952         || nref < 0 || nref >= 81)                      \
1953       goto invalid_code;                                \
1954     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1955   } while (0)
1956
1957
1958 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1959   do {                                                                  \
1960     /* Emacs 21 style format.  The first three bytes at SRC are         \
1961        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1962        the byte length of this composition information, CHARS is the    \
1963        number of characters composed by this composition.  */           \
1964     enum composition_method method = c - 0xF2;                          \
1965     int *charbuf_base = charbuf;                                        \
1966     int consumed_chars_limit;                                           \
1967     int nbytes, nchars;                                                 \
1968                                                                         \
1969     ONE_MORE_BYTE (c);                                                  \
1970     if (c < 0)                                                          \
1971       goto invalid_code;                                                \
1972     nbytes = c - 0xA0;                                                  \
1973     if (nbytes < 3)                                                     \
1974       goto invalid_code;                                                \
1975     ONE_MORE_BYTE (c);                                                  \
1976     if (c < 0)                                                          \
1977       goto invalid_code;                                                \
1978     nchars = c - 0xA0;                                                  \
1979     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
1980     consumed_chars_limit = consumed_chars_base + nbytes;                \
1981     if (method != COMPOSITION_RELATIVE)                                 \
1982       {                                                                 \
1983         int i = 0;                                                      \
1984         while (consumed_chars < consumed_chars_limit)                   \
1985           {                                                             \
1986             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1987               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1988             else                                                        \
1989               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1990             i++;                                                        \
1991           }                                                             \
1992         if (consumed_chars < consumed_chars_limit)                      \
1993           goto invalid_code;                                            \
1994         charbuf_base[0] -= i;                                           \
1995       }                                                                 \
1996   } while (0)
1997
1998
1999 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)                    \
2000   do {                                                                  \
2001     /* Emacs 20 style format for relative composition.  */              \
2002     /* Store multibyte form of characters to be composed.  */           \
2003     enum composition_method method = COMPOSITION_RELATIVE;              \
2004     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];                 \
2005     int *buf = components;                                              \
2006     int i, j;                                                           \
2007                                                                         \
2008     src = src_base;                                                     \
2009     ONE_MORE_BYTE (c);          /* skip 0x80 */                         \
2010     for (i = 0; *src >= 0xA0 && i < MAX_COMPOSITION_COMPONENTS; i++)    \
2011       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                         \
2012     if (i < 2)                                                          \
2013       goto invalid_code;                                                \
2014     ADD_COMPOSITION_DATA (charbuf, i, method);                          \
2015     for (j = 0; j < i; j++)                                             \
2016       *charbuf++ = components[j];                                       \
2017   } while (0)
2018
2019
2020 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
2021   do {                                                          \
2022     /* Emacs 20 style format for rule-base composition.  */     \
2023     /* Store multibyte form of characters to be composed.  */   \
2024     enum composition_method method = COMPOSITION_WITH_RULE;     \
2025     int *charbuf_base = charbuf;                                \
2026     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
2027     int *buf = components;                                      \
2028     int i, j;                                                   \
2029                                                                 \
2030     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
2031     for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++)            \
2032       {                                                         \
2033         if (*src < 0xA0)                                        \
2034           break;                                                \
2035         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
2036         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
2037       }                                                         \
2038     if (i <= 1 || (buf - components) % 2 == 0)                  \
2039       goto invalid_code;                                        \
2040     if (charbuf + i + (i / 2) + 1 >= charbuf_end)               \
2041       goto no_more_source;                                      \
2042     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
2043     i = i * 2 - 1;                                              \
2044     for (j = 0; j < i; j++)                                     \
2045       *charbuf++ = components[j];                               \
2046     charbuf_base[0] -= i;                                       \
2047     for (j = 0; j < i; j += 2)                                  \
2048       *charbuf++ = components[j];                               \
2049   } while (0)
2050
2051
2052 static void
2053 decode_coding_emacs_mule (coding)
2054      struct coding_system *coding;
2055 {
2056   const unsigned char *src = coding->source + coding->consumed;
2057   const unsigned char *src_end = coding->source + coding->src_bytes;
2058   const unsigned char *src_base;
2059   int *charbuf = coding->charbuf + coding->charbuf_used;
2060   int *charbuf_end
2061     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2062   int consumed_chars = 0, consumed_chars_base;
2063   int multibytep = coding->src_multibyte;
2064   Lisp_Object attrs, charset_list;
2065   int char_offset = coding->produced_char;
2066   int last_offset = char_offset;
2067   int last_id = charset_ascii;
2068
2069   CODING_GET_INFO (coding, attrs, charset_list);
2070
2071   while (1)
2072     {
2073       int c;
2074
2075       src_base = src;
2076       consumed_chars_base = consumed_chars;
2077
2078       if (charbuf >= charbuf_end)
2079         break;
2080
2081       ONE_MORE_BYTE (c);
2082       if (c < 0)
2083         {
2084           *charbuf++ = -c;
2085           char_offset++;
2086         }
2087       else if (c < 0x80)
2088         {
2089           *charbuf++ = c;
2090           char_offset++;
2091         }
2092       else if (c == 0x80)
2093         {
2094           ONE_MORE_BYTE (c);
2095           if (c < 0)
2096             goto invalid_code;
2097           if (c - 0xF2 >= COMPOSITION_RELATIVE
2098               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2099             DECODE_EMACS_MULE_21_COMPOSITION (c);
2100           else if (c < 0xC0)
2101             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2102           else if (c == 0xFF)
2103             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2104           else
2105             goto invalid_code;
2106         }
2107       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2108         {
2109           int nbytes, nchars;
2110           int id;
2111
2112           src = src_base;
2113           consumed_chars = consumed_chars_base;
2114           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2115           if (c < 0)
2116             {
2117               if (c == -2)
2118                 break;
2119               goto invalid_code;
2120             }
2121           if (last_id != id)
2122             {
2123               if (last_id != charset_ascii)
2124                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2125               last_id = id;
2126               last_offset = char_offset;
2127             }
2128           *charbuf++ = c;
2129           src += nbytes;
2130           consumed_chars += nchars;
2131           char_offset++;
2132         }
2133       else
2134         goto invalid_code;
2135       continue;
2136
2137     invalid_code:
2138       src = src_base;
2139       consumed_chars = consumed_chars_base;
2140       ONE_MORE_BYTE (c);
2141       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2142       char_offset++;
2143       coding->errors++;
2144     }
2145
2146  no_more_source:
2147   if (last_id != charset_ascii)
2148     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2149   coding->consumed_char += consumed_chars_base;
2150   coding->consumed = src_base - coding->source;
2151   coding->charbuf_used = charbuf - coding->charbuf;
2152 }
2153
2154
2155 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2156   do {                                          \
2157     if (id < 0xA0)                              \
2158       codes[0] = id, codes[1] = 0;              \
2159     else if (id < 0xE0)                         \
2160       codes[0] = 0x9A, codes[1] = id;           \
2161     else if (id < 0xF0)                         \
2162       codes[0] = 0x9B, codes[1] = id;           \
2163     else if (id < 0xF5)                         \
2164       codes[0] = 0x9C, codes[1] = id;           \
2165     else                                        \
2166       codes[0] = 0x9D, codes[1] = id;           \
2167   } while (0);
2168
2169
2170 static int
2171 encode_coding_emacs_mule (coding)
2172      struct coding_system *coding;
2173 {
2174   int multibytep = coding->dst_multibyte;
2175   int *charbuf = coding->charbuf;
2176   int *charbuf_end = charbuf + coding->charbuf_used;
2177   unsigned char *dst = coding->destination + coding->produced;
2178   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2179   int safe_room = 8;
2180   int produced_chars = 0;
2181   Lisp_Object attrs, charset_list;
2182   int c;
2183   int preferred_charset_id = -1;
2184
2185   CODING_GET_INFO (coding, attrs, charset_list);
2186   if (! EQ (charset_list, Vemacs_mule_charset_list))
2187     {
2188       CODING_ATTR_CHARSET_LIST (attrs)
2189         = charset_list = Vemacs_mule_charset_list;
2190     }
2191
2192   while (charbuf < charbuf_end)
2193     {
2194       ASSURE_DESTINATION (safe_room);
2195       c = *charbuf++;
2196
2197       if (c < 0)
2198         {
2199           /* Handle an annotation.  */
2200           switch (*charbuf)
2201             {
2202             case CODING_ANNOTATE_COMPOSITION_MASK:
2203               /* Not yet implemented.  */
2204               break;
2205             case CODING_ANNOTATE_CHARSET_MASK:
2206               preferred_charset_id = charbuf[3];
2207               if (preferred_charset_id >= 0
2208                   && NILP (Fmemq (make_number (preferred_charset_id),
2209                                   charset_list)))
2210                 preferred_charset_id = -1;
2211               break;
2212             default:
2213               abort ();
2214             }
2215           charbuf += -c - 1;
2216           continue;
2217         }
2218
2219       if (ASCII_CHAR_P (c))
2220         EMIT_ONE_ASCII_BYTE (c);
2221       else if (CHAR_BYTE8_P (c))
2222         {
2223           c = CHAR_TO_BYTE8 (c);
2224           EMIT_ONE_BYTE (c);
2225         }
2226       else
2227         {
2228           struct charset *charset;
2229           unsigned code;
2230           int dimension;
2231           int emacs_mule_id;
2232           unsigned char leading_codes[2];
2233
2234           if (preferred_charset_id >= 0)
2235             {
2236               charset = CHARSET_FROM_ID (preferred_charset_id);
2237               if (! CHAR_CHARSET_P (c, charset))
2238                 charset = char_charset (c, charset_list, NULL);
2239             }
2240           else
2241             charset = char_charset (c, charset_list, &code);
2242           if (! charset)
2243             {
2244               c = coding->default_char;
2245               if (ASCII_CHAR_P (c))
2246                 {
2247                   EMIT_ONE_ASCII_BYTE (c);
2248                   continue;
2249                 }
2250               charset = char_charset (c, charset_list, &code);
2251             }
2252           dimension = CHARSET_DIMENSION (charset);
2253           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2254           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2255           EMIT_ONE_BYTE (leading_codes[0]);
2256           if (leading_codes[1])
2257             EMIT_ONE_BYTE (leading_codes[1]);
2258           if (dimension == 1)
2259             EMIT_ONE_BYTE (code | 0x80);
2260           else
2261             {
2262               code |= 0x8080;
2263               EMIT_ONE_BYTE (code >> 8);
2264               EMIT_ONE_BYTE (code & 0xFF);
2265             }
2266         }
2267     }
2268   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2269   coding->produced_char += produced_chars;
2270   coding->produced = dst - coding->destination;
2271   return 0;
2272 }
2273
2274 \f
2275 /*** 7. ISO2022 handlers ***/
2276
2277 /* The following note describes the coding system ISO2022 briefly.
2278    Since the intention of this note is to help understand the
2279    functions in this file, some parts are NOT ACCURATE or are OVERLY
2280    SIMPLIFIED.  For thorough understanding, please refer to the
2281    original document of ISO2022.  This is equivalent to the standard
2282    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2283
2284    ISO2022 provides many mechanisms to encode several character sets
2285    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2286    is encoded using bytes less than 128.  This may make the encoded
2287    text a little bit longer, but the text passes more easily through
2288    several types of gateway, some of which strip off the MSB (Most
2289    Significant Bit).
2290
2291    There are two kinds of character sets: control character sets and
2292    graphic character sets.  The former contain control characters such
2293    as `newline' and `escape' to provide control functions (control
2294    functions are also provided by escape sequences).  The latter
2295    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2296    two control character sets and many graphic character sets.
2297
2298    Graphic character sets are classified into one of the following
2299    four classes, according to the number of bytes (DIMENSION) and
2300    number of characters in one dimension (CHARS) of the set:
2301    - DIMENSION1_CHARS94
2302    - DIMENSION1_CHARS96
2303    - DIMENSION2_CHARS94
2304    - DIMENSION2_CHARS96
2305
2306    In addition, each character set is assigned an identification tag,
2307    unique for each set, called the "final character" (denoted as <F>
2308    hereafter).  The <F> of each character set is decided by ECMA(*)
2309    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2310    (0x30..0x3F are for private use only).
2311
2312    Note (*): ECMA = European Computer Manufacturers Association
2313
2314    Here are examples of graphic character sets [NAME(<F>)]:
2315         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2316         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2317         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2318         o DIMENSION2_CHARS96 -- none for the moment
2319
2320    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2321         C0 [0x00..0x1F] -- control character plane 0
2322         GL [0x20..0x7F] -- graphic character plane 0
2323         C1 [0x80..0x9F] -- control character plane 1
2324         GR [0xA0..0xFF] -- graphic character plane 1
2325
2326    A control character set is directly designated and invoked to C0 or
2327    C1 by an escape sequence.  The most common case is that:
2328    - ISO646's  control character set is designated/invoked to C0, and
2329    - ISO6429's control character set is designated/invoked to C1,
2330    and usually these designations/invocations are omitted in encoded
2331    text.  In a 7-bit environment, only C0 can be used, and a control
2332    character for C1 is encoded by an appropriate escape sequence to
2333    fit into the environment.  All control characters for C1 are
2334    defined to have corresponding escape sequences.
2335
2336    A graphic character set is at first designated to one of four
2337    graphic registers (G0 through G3), then these graphic registers are
2338    invoked to GL or GR.  These designations and invocations can be
2339    done independently.  The most common case is that G0 is invoked to
2340    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2341    these invocations and designations are omitted in encoded text.
2342    In a 7-bit environment, only GL can be used.
2343
2344    When a graphic character set of CHARS94 is invoked to GL, codes
2345    0x20 and 0x7F of the GL area work as control characters SPACE and
2346    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2347    be used.
2348
2349    There are two ways of invocation: locking-shift and single-shift.
2350    With locking-shift, the invocation lasts until the next different
2351    invocation, whereas with single-shift, the invocation affects the
2352    following character only and doesn't affect the locking-shift
2353    state.  Invocations are done by the following control characters or
2354    escape sequences:
2355
2356    ----------------------------------------------------------------------
2357    abbrev  function                  cntrl escape seq   description
2358    ----------------------------------------------------------------------
2359    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2360    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2361    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2362    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2363    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2364    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2365    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2366    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2367    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2368    ----------------------------------------------------------------------
2369    (*) These are not used by any known coding system.
2370
2371    Control characters for these functions are defined by macros
2372    ISO_CODE_XXX in `coding.h'.
2373
2374    Designations are done by the following escape sequences:
2375    ----------------------------------------------------------------------
2376    escape sequence      description
2377    ----------------------------------------------------------------------
2378    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2379    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2380    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2381    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2382    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2383    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2384    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2385    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2386    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2387    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2388    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2389    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2390    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2391    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2392    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2393    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2394    ----------------------------------------------------------------------
2395
2396    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2397    of dimension 1, chars 94, and final character <F>, etc...
2398
2399    Note (*): Although these designations are not allowed in ISO2022,
2400    Emacs accepts them on decoding, and produces them on encoding
2401    CHARS96 character sets in a coding system which is characterized as
2402    7-bit environment, non-locking-shift, and non-single-shift.
2403
2404    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2405    '(' must be omitted.  We refer to this as "short-form" hereafter.
2406
2407    Now you may notice that there are a lot of ways of encoding the
2408    same multilingual text in ISO2022.  Actually, there exist many
2409    coding systems such as Compound Text (used in X11's inter client
2410    communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2411    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2412    localized platforms), and all of these are variants of ISO2022.
2413
2414    In addition to the above, Emacs handles two more kinds of escape
2415    sequences: ISO6429's direction specification and Emacs' private
2416    sequence for specifying character composition.
2417
2418    ISO6429's direction specification takes the following form:
2419         o CSI ']'      -- end of the current direction
2420         o CSI '0' ']'  -- end of the current direction
2421         o CSI '1' ']'  -- start of left-to-right text
2422         o CSI '2' ']'  -- start of right-to-left text
2423    The control character CSI (0x9B: control sequence introducer) is
2424    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2425
2426    Character composition specification takes the following form:
2427         o ESC '0' -- start relative composition
2428         o ESC '1' -- end composition
2429         o ESC '2' -- start rule-base composition (*)
2430         o ESC '3' -- start relative composition with alternate chars  (**)
2431         o ESC '4' -- start rule-base composition with alternate chars  (**)
2432   Since these are not standard escape sequences of any ISO standard,
2433   the use of them with these meanings is restricted to Emacs only.
2434
2435   (*) This form is used only in Emacs 20.7 and older versions,
2436   but newer versions can safely decode it.
2437   (**) This form is used only in Emacs 21.1 and newer versions,
2438   and older versions can't decode it.
2439
2440   Here's a list of example usages of these composition escape
2441   sequences (categorized by `enum composition_method').
2442
2443   COMPOSITION_RELATIVE:
2444         ESC 0 CHAR [ CHAR ] ESC 1
2445   COMPOSITION_WITH_RULE:
2446         ESC 2 CHAR [ RULE CHAR ] ESC 1
2447   COMPOSITION_WITH_ALTCHARS:
2448         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2449   COMPOSITION_WITH_RULE_ALTCHARS:
2450         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2451
2452 enum iso_code_class_type iso_code_class[256];
2453
2454 #define SAFE_CHARSET_P(coding, id)      \
2455   ((id) <= (coding)->max_charset_id     \
2456    && (coding)->safe_charsets[id] >= 0)
2457
2458
2459 #define SHIFT_OUT_OK(category)  \
2460   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2461
2462 static void
2463 setup_iso_safe_charsets (attrs)
2464      Lisp_Object attrs;
2465 {
2466   Lisp_Object charset_list, safe_charsets;
2467   Lisp_Object request;
2468   Lisp_Object reg_usage;
2469   Lisp_Object tail;
2470   int reg94, reg96;
2471   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2472   int max_charset_id;
2473
2474   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2475   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2476       && ! EQ (charset_list, Viso_2022_charset_list))
2477     {
2478       CODING_ATTR_CHARSET_LIST (attrs)
2479         = charset_list = Viso_2022_charset_list;
2480       ASET (attrs, coding_attr_safe_charsets, Qnil);
2481     }
2482
2483   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2484     return;
2485
2486   max_charset_id = 0;
2487   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2488     {
2489       int id = XINT (XCAR (tail));
2490       if (max_charset_id < id)
2491         max_charset_id = id;
2492     }
2493
2494   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2495                                 make_number (255));
2496   request = AREF (attrs, coding_attr_iso_request);
2497   reg_usage = AREF (attrs, coding_attr_iso_usage);
2498   reg94 = XINT (XCAR (reg_usage));
2499   reg96 = XINT (XCDR (reg_usage));
2500
2501   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2502     {
2503       Lisp_Object id;
2504       Lisp_Object reg;
2505       struct charset *charset;
2506
2507       id = XCAR (tail);
2508       charset = CHARSET_FROM_ID (XINT (id));
2509       reg = Fcdr (Fassq (id, request));
2510       if (! NILP (reg))
2511         SSET (safe_charsets, XINT (id), XINT (reg));
2512       else if (charset->iso_chars_96)
2513         {
2514           if (reg96 < 4)
2515             SSET (safe_charsets, XINT (id), reg96);
2516         }
2517       else
2518         {
2519           if (reg94 < 4)
2520             SSET (safe_charsets, XINT (id), reg94);
2521         }
2522     }
2523   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2524 }
2525
2526
2527 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2528    Check if a text is encoded in one of ISO-2022 based codig systems.
2529    If it is, return 1, else return 0.  */
2530
2531 static int
2532 detect_coding_iso_2022 (coding, detect_info)
2533      struct coding_system *coding;
2534      struct coding_detection_info *detect_info;
2535 {
2536   const unsigned char *src = coding->source, *src_base = src;
2537   const unsigned char *src_end = coding->source + coding->src_bytes;
2538   int multibytep = coding->src_multibyte;
2539   int single_shifting = 0;
2540   int id;
2541   int c, c1;
2542   int consumed_chars = 0;
2543   int i;
2544   int rejected = 0;
2545   int found = 0;
2546
2547   detect_info->checked |= CATEGORY_MASK_ISO;
2548
2549   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2550     {
2551       struct coding_system *this = &(coding_categories[i]);
2552       Lisp_Object attrs, val;
2553
2554       attrs = CODING_ID_ATTRS (this->id);
2555       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2556           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2557         setup_iso_safe_charsets (attrs);
2558       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2559       this->max_charset_id = SCHARS (val) - 1;
2560       this->safe_charsets = (char *) SDATA (val);
2561     }
2562
2563   /* A coding system of this category is always ASCII compatible.  */
2564   src += coding->head_ascii;
2565
2566   while (rejected != CATEGORY_MASK_ISO)
2567     {
2568       src_base = src;
2569       ONE_MORE_BYTE (c);
2570       switch (c)
2571         {
2572         case ISO_CODE_ESC:
2573           if (inhibit_iso_escape_detection)
2574             break;
2575           single_shifting = 0;
2576           ONE_MORE_BYTE (c);
2577           if (c >= '(' && c <= '/')
2578             {
2579               /* Designation sequence for a charset of dimension 1.  */
2580               ONE_MORE_BYTE (c1);
2581               if (c1 < ' ' || c1 >= 0x80
2582                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2583                 /* Invalid designation sequence.  Just ignore.  */
2584                 break;
2585             }
2586           else if (c == '$')
2587             {
2588               /* Designation sequence for a charset of dimension 2.  */
2589               ONE_MORE_BYTE (c);
2590               if (c >= '@' && c <= 'B')
2591                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2592                 id = iso_charset_table[1][0][c];
2593               else if (c >= '(' && c <= '/')
2594                 {
2595                   ONE_MORE_BYTE (c1);
2596                   if (c1 < ' ' || c1 >= 0x80
2597                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2598                     /* Invalid designation sequence.  Just ignore.  */
2599                     break;
2600                 }
2601               else
2602                 /* Invalid designation sequence.  Just ignore it.  */
2603                 break;
2604             }
2605           else if (c == 'N' || c == 'O')
2606             {
2607               /* ESC <Fe> for SS2 or SS3.  */
2608               single_shifting = 1;
2609               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2610               break;
2611             }
2612           else if (c >= '0' && c <= '4')
2613             {
2614               /* ESC <Fp> for start/end composition.  */
2615               found |= CATEGORY_MASK_ISO;
2616               break;
2617             }
2618           else
2619             {
2620               /* Invalid escape sequence.  Just ignore it.  */
2621               break;
2622             }
2623
2624           /* We found a valid designation sequence for CHARSET.  */
2625           rejected |= CATEGORY_MASK_ISO_8BIT;
2626           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2627                               id))
2628             found |= CATEGORY_MASK_ISO_7;
2629           else
2630             rejected |= CATEGORY_MASK_ISO_7;
2631           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2632                               id))
2633             found |= CATEGORY_MASK_ISO_7_TIGHT;
2634           else
2635             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2636           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2637                               id))
2638             found |= CATEGORY_MASK_ISO_7_ELSE;
2639           else
2640             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2641           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2642                               id))
2643             found |= CATEGORY_MASK_ISO_8_ELSE;
2644           else
2645             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2646           break;
2647
2648         case ISO_CODE_SO:
2649         case ISO_CODE_SI:
2650           /* Locking shift out/in.  */
2651           if (inhibit_iso_escape_detection)
2652             break;
2653           single_shifting = 0;
2654           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2655           found |= CATEGORY_MASK_ISO_ELSE;
2656           break;
2657
2658         case ISO_CODE_CSI:
2659           /* Control sequence introducer.  */
2660           single_shifting = 0;
2661           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2662           found |= CATEGORY_MASK_ISO_8_ELSE;
2663           goto check_extra_latin;
2664
2665         case ISO_CODE_SS2:
2666         case ISO_CODE_SS3:
2667           /* Single shift.   */
2668           if (inhibit_iso_escape_detection)
2669             break;
2670           single_shifting = 0;
2671           rejected |= CATEGORY_MASK_ISO_7BIT;
2672           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2673               & CODING_ISO_FLAG_SINGLE_SHIFT)
2674             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2675           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2676               & CODING_ISO_FLAG_SINGLE_SHIFT)
2677             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2678           if (single_shifting)
2679             break;
2680           goto check_extra_latin;
2681
2682         default:
2683           if (c < 0)
2684             continue;
2685           if (c < 0x80)
2686             {
2687               single_shifting = 0;
2688               break;
2689             }
2690           if (c >= 0xA0)
2691             {
2692               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2693               found |= CATEGORY_MASK_ISO_8_1;
2694               /* Check the length of succeeding codes of the range
2695                  0xA0..0FF.  If the byte length is even, we include
2696                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2697                  only when we are not single shifting.  */
2698               if (! single_shifting
2699                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2700                 {
2701                   int i = 1;
2702                   while (src < src_end)
2703                     {
2704                       ONE_MORE_BYTE (c);
2705                       if (c < 0xA0)
2706                         break;
2707                       i++;
2708                     }
2709
2710                   if (i & 1 && src < src_end)
2711                     rejected |= CATEGORY_MASK_ISO_8_2;
2712                   else
2713                     found |= CATEGORY_MASK_ISO_8_2;
2714                 }
2715               break;
2716             }
2717         check_extra_latin:
2718           single_shifting = 0;
2719           if (! VECTORP (Vlatin_extra_code_table)
2720               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2721             {
2722               rejected = CATEGORY_MASK_ISO;
2723               break;
2724             }
2725           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2726               & CODING_ISO_FLAG_LATIN_EXTRA)
2727             found |= CATEGORY_MASK_ISO_8_1;
2728           else
2729             rejected |= CATEGORY_MASK_ISO_8_1;
2730           rejected |= CATEGORY_MASK_ISO_8_2;
2731         }
2732     }
2733   detect_info->rejected |= CATEGORY_MASK_ISO;
2734   return 0;
2735
2736  no_more_source:
2737   detect_info->rejected |= rejected;
2738   detect_info->found |= (found & ~rejected);
2739   return 1;
2740 }
2741
2742
2743 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
2744    escape sequence should be kept.  */
2745 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2746   do {                                                                  \
2747     int id, prev;                                                       \
2748                                                                         \
2749     if (final < '0' || final >= 128                                     \
2750         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2751         || !SAFE_CHARSET_P (coding, id))                                \
2752       {                                                                 \
2753         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2754         chars_96 = -1;                                                  \
2755         break;                                                          \
2756       }                                                                 \
2757     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2758     if (id == charset_jisx0201_roman)                                   \
2759       {                                                                 \
2760         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2761           id = charset_ascii;                                           \
2762       }                                                                 \
2763     else if (id == charset_jisx0208_1978)                               \
2764       {                                                                 \
2765         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2766           id = charset_jisx0208;                                        \
2767       }                                                                 \
2768     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2769     /* If there was an invalid designation to REG previously, and this  \
2770        designation is ASCII to REG, we should keep this designation     \
2771        sequence.  */                                                    \
2772     if (prev == -2 && id == charset_ascii)                              \
2773       chars_96 = -1;                                                    \
2774   } while (0)
2775
2776
2777 #define MAYBE_FINISH_COMPOSITION()                              \
2778   do {                                                          \
2779     int i;                                                      \
2780     if (composition_state == COMPOSING_NO)                      \
2781       break;                                                    \
2782     /* It is assured that we have enough room for producing     \
2783        characters stored in the table `components'.  */         \
2784     if (charbuf + component_idx > charbuf_end)                  \
2785       goto no_more_source;                                      \
2786     composition_state = COMPOSING_NO;                           \
2787     if (method == COMPOSITION_RELATIVE                          \
2788         || method == COMPOSITION_WITH_ALTCHARS)                 \
2789       {                                                         \
2790         for (i = 0; i < component_idx; i++)                     \
2791           *charbuf++ = components[i];                           \
2792         char_offset += component_idx;                           \
2793       }                                                         \
2794     else                                                        \
2795       {                                                         \
2796         for (i = 0; i < component_idx; i += 2)                  \
2797           *charbuf++ = components[i];                           \
2798         char_offset += (component_idx / 2) + 1;                 \
2799       }                                                         \
2800   } while (0)
2801
2802
2803 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2804    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2805    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2806    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2807    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2808   */
2809
2810 #define DECODE_COMPOSITION_START(c1)                                    \
2811   do {                                                                  \
2812     if (c1 == '0'                                                       \
2813         && composition_state == COMPOSING_COMPONENT_RULE)               \
2814       {                                                                 \
2815         component_len = component_idx;                                  \
2816         composition_state = COMPOSING_CHAR;                             \
2817       }                                                                 \
2818     else                                                                \
2819       {                                                                 \
2820         const unsigned char *p;                                         \
2821                                                                         \
2822         MAYBE_FINISH_COMPOSITION ();                                    \
2823         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2824           goto no_more_source;                                          \
2825         for (p = src; p < src_end - 1; p++)                             \
2826           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2827             break;                                                      \
2828         if (p == src_end - 1)                                           \
2829           {                                                             \
2830             /* The current composition doesn't end in the current       \
2831                source.  */                                              \
2832             record_conversion_result                                    \
2833               (coding, CODING_RESULT_INSUFFICIENT_SRC);                 \
2834             goto no_more_source;                                        \
2835           }                                                             \
2836                                                                         \
2837         /* This is surely the start of a composition.  */               \
2838         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2839                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2840                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2841                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2842         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2843                              : COMPOSING_COMPONENT_CHAR);               \
2844         component_idx = component_len = 0;                              \
2845       }                                                                 \
2846   } while (0)
2847
2848
2849 /* Handle compositoin end sequence ESC 1.  */
2850
2851 #define DECODE_COMPOSITION_END()                                        \
2852   do {                                                                  \
2853     int nchars = (component_len > 0 ? component_idx - component_len     \
2854                   : method == COMPOSITION_RELATIVE ? component_idx      \
2855                   : (component_idx + 1) / 2);                           \
2856     int i;                                                              \
2857     int *saved_charbuf = charbuf;                                       \
2858                                                                         \
2859     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
2860     if (method != COMPOSITION_RELATIVE)                                 \
2861       {                                                                 \
2862         if (component_len == 0)                                         \
2863           for (i = 0; i < component_idx; i++)                           \
2864             *charbuf++ = components[i];                                 \
2865         else                                                            \
2866           for (i = 0; i < component_len; i++)                           \
2867             *charbuf++ = components[i];                                 \
2868         *saved_charbuf = saved_charbuf - charbuf;                       \
2869       }                                                                 \
2870     if (method == COMPOSITION_WITH_RULE)                                \
2871       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2872         *charbuf++ = components[i];                                     \
2873     else                                                                \
2874       for (i = component_len; i < component_idx; i++, char_offset++)    \
2875         *charbuf++ = components[i];                                     \
2876     coding->annotated = 1;                                              \
2877     composition_state = COMPOSING_NO;                                   \
2878   } while (0)
2879
2880
2881 /* Decode a composition rule from the byte C1 (and maybe one more byte
2882    from SRC) and store one encoded composition rule in
2883    coding->cmp_data.  */
2884
2885 #define DECODE_COMPOSITION_RULE(c1)                                     \
2886   do {                                                                  \
2887     (c1) -= 32;                                                         \
2888     if (c1 < 81)                /* old format (before ver.21) */        \
2889       {                                                                 \
2890         int gref = (c1) / 9;                                            \
2891         int nref = (c1) % 9;                                            \
2892         if (gref == 4) gref = 10;                                       \
2893         if (nref == 4) nref = 10;                                       \
2894         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2895       }                                                                 \
2896     else if (c1 < 93)           /* new format (after ver.21) */         \
2897       {                                                                 \
2898         ONE_MORE_BYTE (c2);                                             \
2899         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2900       }                                                                 \
2901     else                                                                \
2902       c1 = 0;                                                           \
2903   } while (0)
2904
2905
2906 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2907
2908 static void
2909 decode_coding_iso_2022 (coding)
2910      struct coding_system *coding;
2911 {
2912   const unsigned char *src = coding->source + coding->consumed;
2913   const unsigned char *src_end = coding->source + coding->src_bytes;
2914   const unsigned char *src_base;
2915   int *charbuf = coding->charbuf + coding->charbuf_used;
2916   int *charbuf_end
2917     = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2918   int consumed_chars = 0, consumed_chars_base;
2919   int multibytep = coding->src_multibyte;
2920   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2921   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2922   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2923   int charset_id_2, charset_id_3;
2924   struct charset *charset;
2925   int c;
2926   /* For handling composition sequence.  */
2927 #define COMPOSING_NO                    0
2928 #define COMPOSING_CHAR                  1
2929 #define COMPOSING_RULE                  2
2930 #define COMPOSING_COMPONENT_CHAR        3
2931 #define COMPOSING_COMPONENT_RULE        4
2932
2933   int composition_state = COMPOSING_NO;
2934   enum composition_method method;
2935   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2936   int component_idx;
2937   int component_len;
2938   Lisp_Object attrs, charset_list;
2939   int char_offset = coding->produced_char;
2940   int last_offset = char_offset;
2941   int last_id = charset_ascii;
2942
2943   CODING_GET_INFO (coding, attrs, charset_list);
2944   setup_iso_safe_charsets (attrs);
2945
2946   while (1)
2947     {
2948       int c1, c2;
2949
2950       src_base = src;
2951       consumed_chars_base = consumed_chars;
2952
2953       if (charbuf >= charbuf_end)
2954         break;
2955
2956       ONE_MORE_BYTE (c1);
2957       if (c1 < 0)
2958         goto invalid_code;
2959
2960       /* We produce at most one character.  */
2961       switch (iso_code_class [c1])
2962         {
2963         case ISO_0x20_or_0x7F:
2964           if (composition_state != COMPOSING_NO)
2965             {
2966               if (composition_state == COMPOSING_RULE
2967                   || composition_state == COMPOSING_COMPONENT_RULE)
2968                 {
2969                   DECODE_COMPOSITION_RULE (c1);
2970                   components[component_idx++] = c1;
2971                   composition_state--;
2972                   continue;
2973                 }
2974             }
2975           if (charset_id_0 < 0
2976               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2977             /* This is SPACE or DEL.  */
2978             charset = CHARSET_FROM_ID (charset_ascii);
2979           else
2980             charset = CHARSET_FROM_ID (charset_id_0);
2981           break;
2982
2983         case ISO_graphic_plane_0:
2984           if (composition_state != COMPOSING_NO)
2985             {
2986               if (composition_state == COMPOSING_RULE
2987                   || composition_state == COMPOSING_COMPONENT_RULE)
2988                 {
2989                   DECODE_COMPOSITION_RULE (c1);
2990                   components[component_idx++] = c1;
2991                   composition_state--;
2992                   continue;
2993                 }
2994             }
2995           if (charset_id_0 < 0)
2996             charset = CHARSET_FROM_ID (charset_ascii);
2997           else
2998             charset = CHARSET_FROM_ID (charset_id_0);
2999           break;
3000
3001         case ISO_0xA0_or_0xFF:
3002           if (charset_id_1 < 0
3003               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
3004               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
3005             goto invalid_code;
3006           /* This is a graphic character, we fall down ... */
3007
3008         case ISO_graphic_plane_1:
3009           if (charset_id_1 < 0)
3010             goto invalid_code;
3011           charset = CHARSET_FROM_ID (charset_id_1);
3012           break;
3013
3014         case ISO_control_0:
3015           MAYBE_FINISH_COMPOSITION ();
3016           charset = CHARSET_FROM_ID (charset_ascii);
3017           break;
3018
3019         case ISO_control_1:
3020           MAYBE_FINISH_COMPOSITION ();
3021           goto invalid_code;
3022
3023         case ISO_shift_out:
3024           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3025               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3026             goto invalid_code;
3027           CODING_ISO_INVOCATION (coding, 0) = 1;
3028           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3029           continue;
3030
3031         case ISO_shift_in:
3032           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3033             goto invalid_code;
3034           CODING_ISO_INVOCATION (coding, 0) = 0;
3035           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3036           continue;
3037
3038         case ISO_single_shift_2_7:
3039         case ISO_single_shift_2:
3040           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3041             goto invalid_code;
3042           /* SS2 is handled as an escape sequence of ESC 'N' */
3043           c1 = 'N';
3044           goto label_escape_sequence;
3045
3046         case ISO_single_shift_3:
3047           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3048             goto invalid_code;
3049           /* SS2 is handled as an escape sequence of ESC 'O' */
3050           c1 = 'O';
3051           goto label_escape_sequence;
3052
3053         case ISO_control_sequence_introducer:
3054           /* CSI is handled as an escape sequence of ESC '[' ...  */
3055           c1 = '[';
3056           goto label_escape_sequence;
3057
3058         case ISO_escape:
3059           ONE_MORE_BYTE (c1);
3060         label_escape_sequence:
3061           /* Escape sequences handled here are invocation,
3062              designation, direction specification, and character
3063              composition specification.  */
3064           switch (c1)
3065             {
3066             case '&':           /* revision of following character set */
3067               ONE_MORE_BYTE (c1);
3068               if (!(c1 >= '@' && c1 <= '~'))
3069                 goto invalid_code;
3070               ONE_MORE_BYTE (c1);
3071               if (c1 != ISO_CODE_ESC)
3072                 goto invalid_code;
3073               ONE_MORE_BYTE (c1);
3074               goto label_escape_sequence;
3075
3076             case '$':           /* designation of 2-byte character set */
3077               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3078                 goto invalid_code;
3079               {
3080                 int reg, chars96;
3081
3082                 ONE_MORE_BYTE (c1);
3083                 if (c1 >= '@' && c1 <= 'B')
3084                   {     /* designation of JISX0208.1978, GB2312.1980,
3085                            or JISX0208.1980 */
3086                     reg = 0, chars96 = 0;
3087                   }
3088                 else if (c1 >= 0x28 && c1 <= 0x2B)
3089                   { /* designation of DIMENSION2_CHARS94 character set */
3090                     reg = c1 - 0x28, chars96 = 0;
3091                     ONE_MORE_BYTE (c1);
3092                   }
3093                 else if (c1 >= 0x2C && c1 <= 0x2F)
3094                   { /* designation of DIMENSION2_CHARS96 character set */
3095                     reg = c1 - 0x2C, chars96 = 1;
3096                     ONE_MORE_BYTE (c1);
3097                   }
3098                 else
3099                   goto invalid_code;
3100                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3101                 /* We must update these variables now.  */
3102                 if (reg == 0)
3103                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3104                 else if (reg == 1)
3105                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3106                 if (chars96 < 0)
3107                   goto invalid_code;
3108               }
3109               continue;
3110
3111             case 'n':           /* invocation of locking-shift-2 */
3112               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3113                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3114                 goto invalid_code;
3115               CODING_ISO_INVOCATION (coding, 0) = 2;
3116               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3117               continue;
3118
3119             case 'o':           /* invocation of locking-shift-3 */
3120               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3121                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3122                 goto invalid_code;
3123               CODING_ISO_INVOCATION (coding, 0) = 3;
3124               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3125               continue;
3126
3127             case 'N':           /* invocation of single-shift-2 */
3128               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3129                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3130                 goto invalid_code;
3131               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3132               if (charset_id_2 < 0)
3133                 charset = CHARSET_FROM_ID (charset_ascii);
3134               else
3135                 charset = CHARSET_FROM_ID (charset_id_2);
3136               ONE_MORE_BYTE (c1);
3137               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3138                 goto invalid_code;
3139               break;
3140
3141             case 'O':           /* invocation of single-shift-3 */
3142               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3143                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3144                 goto invalid_code;
3145               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3146               if (charset_id_3 < 0)
3147                 charset = CHARSET_FROM_ID (charset_ascii);
3148               else
3149                 charset = CHARSET_FROM_ID (charset_id_3);
3150               ONE_MORE_BYTE (c1);
3151               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3152                 goto invalid_code;
3153               break;
3154
3155             case '0': case '2': case '3': case '4': /* start composition */
3156               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3157                 goto invalid_code;
3158               DECODE_COMPOSITION_START (c1);
3159               continue;
3160
3161             case '1':           /* end composition */
3162               if (composition_state == COMPOSING_NO)
3163                 goto invalid_code;
3164               DECODE_COMPOSITION_END ();
3165               continue;
3166
3167             case '[':           /* specification of direction */
3168               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3169                 goto invalid_code;
3170               /* For the moment, nested direction is not supported.
3171                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3172                  left-to-right, and nozero means right-to-left.  */
3173               ONE_MORE_BYTE (c1);
3174               switch (c1)
3175                 {
3176                 case ']':       /* end of the current direction */
3177                   coding->mode &= ~CODING_MODE_DIRECTION;
3178
3179                 case '0':       /* end of the current direction */
3180                 case '1':       /* start of left-to-right direction */
3181                   ONE_MORE_BYTE (c1);
3182                   if (c1 == ']')
3183                     coding->mode &= ~CODING_MODE_DIRECTION;
3184                   else
3185                     goto invalid_code;
3186                   break;
3187
3188                 case '2':       /* start of right-to-left direction */
3189                   ONE_MORE_BYTE (c1);
3190                   if (c1 == ']')
3191                     coding->mode |= CODING_MODE_DIRECTION;
3192                   else
3193                     goto invalid_code;
3194                   break;
3195
3196                 default:
3197                   goto invalid_code;
3198                 }
3199               continue;
3200
3201             case '%':
3202               ONE_MORE_BYTE (c1);
3203               if (c1 == '/')
3204                 {
3205                   /* CTEXT extended segment:
3206                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3207                      We keep these bytes as is for the moment.
3208                      They may be decoded by post-read-conversion.  */
3209                   int dim, M, L;
3210                   int size;
3211
3212                   ONE_MORE_BYTE (dim);
3213                   ONE_MORE_BYTE (M);
3214                   ONE_MORE_BYTE (L);
3215                   size = ((M - 128) * 128) + (L - 128);
3216                   if (charbuf + 8 + size > charbuf_end)
3217                     goto break_loop;
3218                   *charbuf++ = ISO_CODE_ESC;
3219                   *charbuf++ = '%';
3220                   *charbuf++ = '/';
3221                   *charbuf++ = dim;
3222                   *charbuf++ = BYTE8_TO_CHAR (M);
3223                   *charbuf++ = BYTE8_TO_CHAR (L);
3224                   while (size-- > 0)
3225                     {
3226                       ONE_MORE_BYTE (c1);
3227                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3228                     }
3229                 }
3230               else if (c1 == 'G')
3231                 {
3232                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3233                      ESC % G --UTF-8-BYTES-- ESC % @
3234                      We keep these bytes as is for the moment.
3235                      They may be decoded by post-read-conversion.  */
3236                   int *p = charbuf;
3237
3238                   if (p + 6 > charbuf_end)
3239                     goto break_loop;
3240                   *p++ = ISO_CODE_ESC;
3241                   *p++ = '%';
3242                   *p++ = 'G';
3243                   while (p < charbuf_end)
3244                     {
3245                       ONE_MORE_BYTE (c1);
3246                       if (c1 == ISO_CODE_ESC
3247                           && src + 1 < src_end
3248                           && src[0] == '%'
3249                           && src[1] == '@')
3250                         {
3251                           src += 2;
3252                           break;
3253                         }
3254                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3255                     }
3256                   if (p + 3 > charbuf_end)
3257                     goto break_loop;
3258                   *p++ = ISO_CODE_ESC;
3259                   *p++ = '%';
3260                   *p++ = '@';
3261                   charbuf = p;
3262                 }
3263               else
3264                 goto invalid_code;
3265               continue;
3266               break;
3267
3268             default:
3269               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3270                 goto invalid_code;
3271               {
3272                 int reg, chars96;
3273
3274                 if (c1 >= 0x28 && c1 <= 0x2B)
3275                   { /* designation of DIMENSION1_CHARS94 character set */
3276                     reg = c1 - 0x28, chars96 = 0;
3277                     ONE_MORE_BYTE (c1);
3278                   }
3279                 else if (c1 >= 0x2C && c1 <= 0x2F)
3280                   { /* designation of DIMENSION1_CHARS96 character set */
3281                     reg = c1 - 0x2C, chars96 = 1;
3282                     ONE_MORE_BYTE (c1);
3283                   }
3284                 else
3285                   goto invalid_code;
3286                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3287                 /* We must update these variables now.  */
3288                 if (reg == 0)
3289                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3290                 else if (reg == 1)
3291                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3292                 if (chars96 < 0)
3293                   goto invalid_code;
3294               }
3295               continue;
3296             }
3297         }
3298
3299       if (charset->id != charset_ascii
3300           && last_id != charset->id)
3301         {
3302           if (last_id != charset_ascii)
3303             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3304           last_id = charset->id;
3305           last_offset = char_offset;
3306         }
3307
3308       /* Now we know CHARSET and 1st position code C1 of a character.
3309          Produce a decoded character while getting 2nd position code
3310          C2 if necessary.  */
3311       c1 &= 0x7F;
3312       if (CHARSET_DIMENSION (charset) > 1)
3313         {
3314           ONE_MORE_BYTE (c2);
3315           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3316             /* C2 is not in a valid range.  */
3317             goto invalid_code;
3318           c1 = (c1 << 8) | (c2 & 0x7F);
3319           if (CHARSET_DIMENSION (charset) > 2)
3320             {
3321               ONE_MORE_BYTE (c2);
3322               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3323                 /* C2 is not in a valid range.  */
3324                 goto invalid_code;
3325               c1 = (c1 << 8) | (c2 & 0x7F);
3326             }
3327         }
3328
3329       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3330       if (c < 0)
3331         {
3332           MAYBE_FINISH_COMPOSITION ();
3333           for (; src_base < src; src_base++, char_offset++)
3334             {
3335               if (ASCII_BYTE_P (*src_base))
3336                 *charbuf++ = *src_base;
3337               else
3338                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3339             }
3340         }
3341       else if (composition_state == COMPOSING_NO)
3342         {
3343           *charbuf++ = c;
3344           char_offset++;
3345         }
3346       else
3347         {
3348           components[component_idx++] = c;
3349           if (method == COMPOSITION_WITH_RULE
3350               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3351                   && composition_state == COMPOSING_COMPONENT_CHAR))
3352             composition_state++;
3353         }
3354       continue;
3355
3356     invalid_code:
3357       MAYBE_FINISH_COMPOSITION ();
3358       src = src_base;
3359       consumed_chars = consumed_chars_base;
3360       ONE_MORE_BYTE (c);
3361       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3362       char_offset++;
3363       coding->errors++;
3364       continue;
3365
3366     break_loop:
3367       break;
3368     }
3369
3370  no_more_source:
3371   if (last_id != charset_ascii)
3372     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3373   coding->consumed_char += consumed_chars_base;
3374   coding->consumed = src_base - coding->source;
3375   coding->charbuf_used = charbuf - coding->charbuf;
3376 }
3377
3378
3379 /* ISO2022 encoding stuff.  */
3380
3381 /*
3382    It is not enough to say just "ISO2022" on encoding, we have to
3383    specify more details.  In Emacs, each coding system of ISO2022
3384    variant has the following specifications:
3385         1. Initial designation to G0 thru G3.
3386         2. Allows short-form designation?
3387         3. ASCII should be designated to G0 before control characters?
3388         4. ASCII should be designated to G0 at end of line?
3389         5. 7-bit environment or 8-bit environment?
3390         6. Use locking-shift?
3391         7. Use Single-shift?
3392    And the following two are only for Japanese:
3393         8. Use ASCII in place of JIS0201-1976-Roman?
3394         9. Use JISX0208-1983 in place of JISX0208-1978?
3395    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3396    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3397    details.
3398 */
3399
3400 /* Produce codes (escape sequence) for designating CHARSET to graphic
3401    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3402    '@', 'A', or 'B' and the coding system CODING allows, produce
3403    designation sequence of short-form.  */
3404
3405 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3406   do {                                                                  \
3407     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3408     char *intermediate_char_94 = "()*+";                                \
3409     char *intermediate_char_96 = ",-./";                                \
3410     int revision = -1;                                                  \
3411     int c;                                                              \
3412                                                                         \
3413     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3414       revision = CHARSET_ISO_REVISION (charset);                        \
3415                                                                         \
3416     if (revision >= 0)                                                  \
3417       {                                                                 \
3418         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3419         EMIT_ONE_BYTE ('@' + revision);                                 \
3420       }                                                                 \
3421     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3422     if (CHARSET_DIMENSION (charset) == 1)                               \
3423       {                                                                 \
3424         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3425           c = intermediate_char_94[reg];                                \
3426         else                                                            \
3427           c = intermediate_char_96[reg];                                \
3428         EMIT_ONE_ASCII_BYTE (c);                                        \
3429       }                                                                 \
3430     else                                                                \
3431       {                                                                 \
3432         EMIT_ONE_ASCII_BYTE ('$');                                      \
3433         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3434           {                                                             \
3435             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3436                 || reg != 0                                             \
3437                 || final_char < '@' || final_char > 'B')                \
3438               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3439           }                                                             \
3440         else                                                            \
3441           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3442       }                                                                 \
3443     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3444                                                                         \
3445     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3446   } while (0)
3447
3448
3449 /* The following two macros produce codes (control character or escape
3450    sequence) for ISO2022 single-shift functions (single-shift-2 and
3451    single-shift-3).  */
3452
3453 #define ENCODE_SINGLE_SHIFT_2                                           \
3454   do {                                                                  \
3455     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3456       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3457     else                                                                \
3458       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3459     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3460   } while (0)
3461
3462
3463 #define ENCODE_SINGLE_SHIFT_3                                           \
3464   do {                                                                  \
3465     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3466       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
3467     else                                                                \
3468       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
3469     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3470   } while (0)
3471
3472
3473 /* The following four macros produce codes (control character or
3474    escape sequence) for ISO2022 locking-shift functions (shift-in,
3475    shift-out, locking-shift-2, and locking-shift-3).  */
3476
3477 #define ENCODE_SHIFT_IN                                 \
3478   do {                                                  \
3479     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
3480     CODING_ISO_INVOCATION (coding, 0) = 0;              \
3481   } while (0)
3482
3483
3484 #define ENCODE_SHIFT_OUT                                \
3485   do {                                                  \
3486     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
3487     CODING_ISO_INVOCATION (coding, 0) = 1;              \
3488   } while (0)
3489
3490
3491 #define ENCODE_LOCKING_SHIFT_2                          \
3492   do {                                                  \
3493     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3494     CODING_ISO_INVOCATION (coding, 0) = 2;              \
3495   } while (0)
3496
3497
3498 #define ENCODE_LOCKING_SHIFT_3                          \
3499   do {                                                  \
3500     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3501     CODING_ISO_INVOCATION (coding, 0) = 3;              \
3502   } while (0)
3503
3504
3505 /* Produce codes for a DIMENSION1 character whose character set is
3506    CHARSET and whose position-code is C1.  Designation and invocation
3507    sequences are also produced in advance if necessary.  */
3508
3509 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
3510   do {                                                                  \
3511     int id = CHARSET_ID (charset);                                      \
3512                                                                         \
3513     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
3514         && id == charset_ascii)                                         \
3515       {                                                                 \
3516         id = charset_jisx0201_roman;                                    \
3517         charset = CHARSET_FROM_ID (id);                                 \
3518       }                                                                 \
3519                                                                         \
3520     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3521       {                                                                 \
3522         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3523           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
3524         else                                                            \
3525           EMIT_ONE_BYTE (c1 | 0x80);                                    \
3526         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3527         break;                                                          \
3528       }                                                                 \
3529     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3530       {                                                                 \
3531         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
3532         break;                                                          \
3533       }                                                                 \
3534     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3535       {                                                                 \
3536         EMIT_ONE_BYTE (c1 | 0x80);                                      \
3537         break;                                                          \
3538       }                                                                 \
3539     else                                                                \
3540       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3541          must invoke it, or, at first, designate it to some graphic     \
3542          register.  Then repeat the loop to actually produce the        \
3543          character.  */                                                 \
3544       dst = encode_invocation_designation (charset, coding, dst,        \
3545                                            &produced_chars);            \
3546   } while (1)
3547
3548
3549 /* Produce codes for a DIMENSION2 character whose character set is
3550    CHARSET and whose position-codes are C1 and C2.  Designation and
3551    invocation codes are also produced in advance if necessary.  */
3552
3553 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
3554   do {                                                                  \
3555     int id = CHARSET_ID (charset);                                      \
3556                                                                         \
3557     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
3558         && id == charset_jisx0208)                                      \
3559       {                                                                 \
3560         id = charset_jisx0208_1978;                                     \
3561         charset = CHARSET_FROM_ID (id);                                 \
3562       }                                                                 \
3563                                                                         \
3564     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3565       {                                                                 \
3566         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3567           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
3568         else                                                            \
3569           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
3570         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3571         break;                                                          \
3572       }                                                                 \
3573     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3574       {                                                                 \
3575         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
3576         break;                                                          \
3577       }                                                                 \
3578     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3579       {                                                                 \
3580         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
3581         break;                                                          \
3582       }                                                                 \
3583     else                                                                \
3584       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3585          must invoke it, or, at first, designate it to some graphic     \
3586          register.  Then repeat the loop to actually produce the        \
3587          character.  */                                                 \
3588       dst = encode_invocation_designation (charset, coding, dst,        \
3589                                            &produced_chars);            \
3590   } while (1)
3591
3592
3593 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
3594   do {                                                                     \
3595     int code = ENCODE_CHAR ((charset),(c));                                \
3596                                                                            \
3597     if (CHARSET_DIMENSION (charset) == 1)                                  \
3598       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
3599     else                                                                   \
3600       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3601   } while (0)
3602
3603
3604 /* Produce designation and invocation codes at a place pointed by DST
3605    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3606    Return new DST.  */
3607
3608 unsigned char *
3609 encode_invocation_designation (charset, coding, dst, p_nchars)
3610      struct charset *charset;
3611      struct coding_system *coding;
3612      unsigned char *dst;
3613      int *p_nchars;
3614 {
3615   int multibytep = coding->dst_multibyte;
3616   int produced_chars = *p_nchars;
3617   int reg;                      /* graphic register number */
3618   int id = CHARSET_ID (charset);
3619
3620   /* At first, check designations.  */
3621   for (reg = 0; reg < 4; reg++)
3622     if (id == CODING_ISO_DESIGNATION (coding, reg))
3623       break;
3624
3625   if (reg >= 4)
3626     {
3627       /* CHARSET is not yet designated to any graphic registers.  */
3628       /* At first check the requested designation.  */
3629       reg = CODING_ISO_REQUEST (coding, id);
3630       if (reg < 0)
3631         /* Since CHARSET requests no special designation, designate it
3632            to graphic register 0.  */
3633         reg = 0;
3634
3635       ENCODE_DESIGNATION (charset, reg, coding);
3636     }
3637
3638   if (CODING_ISO_INVOCATION (coding, 0) != reg
3639       && CODING_ISO_INVOCATION (coding, 1) != reg)
3640     {
3641       /* Since the graphic register REG is not invoked to any graphic
3642          planes, invoke it to graphic plane 0.  */
3643       switch (reg)
3644         {
3645         case 0:                 /* graphic register 0 */
3646           ENCODE_SHIFT_IN;
3647           break;
3648
3649         case 1:                 /* graphic register 1 */
3650           ENCODE_SHIFT_OUT;
3651           break;
3652
3653         case 2:                 /* graphic register 2 */
3654           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3655             ENCODE_SINGLE_SHIFT_2;
3656           else
3657             ENCODE_LOCKING_SHIFT_2;
3658           break;
3659
3660         case 3:                 /* graphic register 3 */
3661           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3662             ENCODE_SINGLE_SHIFT_3;
3663           else
3664             ENCODE_LOCKING_SHIFT_3;
3665           break;
3666         }
3667     }
3668
3669   *p_nchars = produced_chars;
3670   return dst;
3671 }
3672
3673 /* The following three macros produce codes for indicating direction
3674    of text.  */
3675 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
3676   do {                                                                  \
3677     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
3678       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
3679     else                                                                \
3680       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
3681   } while (0)
3682
3683
3684 #define ENCODE_DIRECTION_R2L()                  \
3685   do {                                          \
3686     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3687     EMIT_TWO_ASCII_BYTES ('2', ']');            \
3688   } while (0)
3689
3690
3691 #define ENCODE_DIRECTION_L2R()                  \
3692   do {                                          \
3693     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3694     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3695   } while (0)
3696
3697
3698 /* Produce codes for designation and invocation to reset the graphic
3699    planes and registers to initial state.  */
3700 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3701   do {                                                                  \
3702     int reg;                                                            \
3703     struct charset *charset;                                            \
3704                                                                         \
3705     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3706       ENCODE_SHIFT_IN;                                                  \
3707     for (reg = 0; reg < 4; reg++)                                       \
3708       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3709           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3710               != CODING_ISO_INITIAL (coding, reg)))                     \
3711         {                                                               \
3712           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3713           ENCODE_DESIGNATION (charset, reg, coding);                    \
3714         }                                                               \
3715   } while (0)
3716
3717
3718 /* Produce designation sequences of charsets in the line started from
3719    SRC to a place pointed by DST, and return updated DST.
3720
3721    If the current block ends before any end-of-line, we may fail to
3722    find all the necessary designations.  */
3723
3724 static unsigned char *
3725 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3726      struct coding_system *coding;
3727      int *charbuf, *charbuf_end;
3728      unsigned char *dst;
3729 {
3730   struct charset *charset;
3731   /* Table of charsets to be designated to each graphic register.  */
3732   int r[4];
3733   int c, found = 0, reg;
3734   int produced_chars = 0;
3735   int multibytep = coding->dst_multibyte;
3736   Lisp_Object attrs;
3737   Lisp_Object charset_list;
3738
3739   attrs = CODING_ID_ATTRS (coding->id);
3740   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3741   if (EQ (charset_list, Qiso_2022))
3742     charset_list = Viso_2022_charset_list;
3743
3744   for (reg = 0; reg < 4; reg++)
3745     r[reg] = -1;
3746
3747   while (found < 4)
3748     {
3749       int id;
3750
3751       c = *charbuf++;
3752       if (c == '\n')
3753         break;
3754       charset = char_charset (c, charset_list, NULL);
3755       id = CHARSET_ID (charset);
3756       reg = CODING_ISO_REQUEST (coding, id);
3757       if (reg >= 0 && r[reg] < 0)
3758         {
3759           found++;
3760           r[reg] = id;
3761         }
3762     }
3763
3764   if (found)
3765     {
3766       for (reg = 0; reg < 4; reg++)
3767         if (r[reg] >= 0
3768             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3769           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3770     }
3771
3772   return dst;
3773 }
3774
3775 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3776
3777 static int
3778 encode_coding_iso_2022 (coding)
3779      struct coding_system *coding;
3780 {
3781   int multibytep = coding->dst_multibyte;
3782   int *charbuf = coding->charbuf;
3783   int *charbuf_end = charbuf + coding->charbuf_used;
3784   unsigned char *dst = coding->destination + coding->produced;
3785   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3786   int safe_room = 16;
3787   int bol_designation
3788     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3789        && CODING_ISO_BOL (coding));
3790   int produced_chars = 0;
3791   Lisp_Object attrs, eol_type, charset_list;
3792   int ascii_compatible;
3793   int c;
3794   int preferred_charset_id = -1;
3795
3796   CODING_GET_INFO (coding, attrs, charset_list);
3797   eol_type = CODING_ID_EOL_TYPE (coding->id);
3798   if (VECTORP (eol_type))
3799     eol_type = Qunix;
3800
3801   setup_iso_safe_charsets (attrs);
3802   /* Charset list may have been changed.  */
3803   charset_list = CODING_ATTR_CHARSET_LIST (attrs);              \
3804   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3805
3806   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3807
3808   while (charbuf < charbuf_end)
3809     {
3810       ASSURE_DESTINATION (safe_room);
3811
3812       if (bol_designation)
3813         {
3814           unsigned char *dst_prev = dst;
3815
3816           /* We have to produce designation sequences if any now.  */
3817           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3818           bol_designation = 0;
3819           /* We are sure that designation sequences are all ASCII bytes.  */
3820           produced_chars += dst - dst_prev;
3821         }
3822
3823       c = *charbuf++;
3824
3825       if (c < 0)
3826         {
3827           /* Handle an annotation.  */
3828           switch (*charbuf)
3829             {
3830             case CODING_ANNOTATE_COMPOSITION_MASK:
3831               /* Not yet implemented.  */
3832               break;
3833             case CODING_ANNOTATE_CHARSET_MASK:
3834               preferred_charset_id = charbuf[2];
3835               if (preferred_charset_id >= 0
3836                   && NILP (Fmemq (make_number (preferred_charset_id),
3837                                   charset_list)))
3838                 preferred_charset_id = -1;
3839               break;
3840             default:
3841               abort ();
3842             }
3843           charbuf += -c - 1;
3844           continue;
3845         }
3846
3847       /* Now encode the character C.  */
3848       if (c < 0x20 || c == 0x7F)
3849         {
3850           if (c == '\n'
3851               || (c == '\r' && EQ (eol_type, Qmac)))
3852             {
3853               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3854                 ENCODE_RESET_PLANE_AND_REGISTER ();
3855               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3856                 {
3857                   int i;
3858
3859                   for (i = 0; i < 4; i++)
3860                     CODING_ISO_DESIGNATION (coding, i)
3861                       = CODING_ISO_INITIAL (coding, i);
3862                 }
3863               bol_designation
3864                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3865             }
3866           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3867             ENCODE_RESET_PLANE_AND_REGISTER ();
3868           EMIT_ONE_ASCII_BYTE (c);
3869         }
3870       else if (ASCII_CHAR_P (c))
3871         {
3872           if (ascii_compatible)
3873             EMIT_ONE_ASCII_BYTE (c);
3874           else
3875             {
3876               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3877               ENCODE_ISO_CHARACTER (charset, c);
3878             }
3879         }
3880       else if (CHAR_BYTE8_P (c))
3881         {
3882           c = CHAR_TO_BYTE8 (c);
3883           EMIT_ONE_BYTE (c);
3884         }
3885       else
3886         {
3887           struct charset *charset;
3888
3889           if (preferred_charset_id >= 0)
3890             {
3891               charset = CHARSET_FROM_ID (preferred_charset_id);
3892               if (! CHAR_CHARSET_P (c, charset))
3893                 charset = char_charset (c, charset_list, NULL);
3894             }
3895           else
3896             charset = char_charset (c, charset_list, NULL);
3897           if (!charset)
3898             {
3899               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3900                 {
3901                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3902                   charset = CHARSET_FROM_ID (charset_ascii);
3903                 }
3904               else
3905                 {
3906                   c = coding->default_char;
3907                   charset = char_charset (c, charset_list, NULL);
3908                 }
3909             }
3910           ENCODE_ISO_CHARACTER (charset, c);
3911         }
3912     }
3913
3914   if (coding->mode & CODING_MODE_LAST_BLOCK
3915       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3916     {
3917       ASSURE_DESTINATION (safe_room);
3918       ENCODE_RESET_PLANE_AND_REGISTER ();
3919     }
3920   record_conversion_result (coding, CODING_RESULT_SUCCESS);
3921   CODING_ISO_BOL (coding) = bol_designation;
3922   coding->produced_char += produced_chars;
3923   coding->produced = dst - coding->destination;
3924   return 0;
3925 }
3926
3927 \f
3928 /*** 8,9. SJIS and BIG5 handlers ***/
3929
3930 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3931    quite widely.  So, for the moment, Emacs supports them in the bare
3932    C code.  But, in the future, they may be supported only by CCL.  */
3933
3934 /* SJIS is a coding system encoding three character sets: ASCII, right
3935    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3936    as is.  A character of charset katakana-jisx0201 is encoded by
3937    "position-code + 0x80".  A character of charset japanese-jisx0208
3938    is encoded in 2-byte but two position-codes are divided and shifted
3939    so that it fit in the range below.
3940
3941    --- CODE RANGE of SJIS ---
3942    (character set)      (range)
3943    ASCII                0x00 .. 0x7F
3944    KATAKANA-JISX0201    0xA0 .. 0xDF
3945    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3946             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3947    -------------------------------
3948
3949 */
3950
3951 /* BIG5 is a coding system encoding two character sets: ASCII and
3952    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3953    character set and is encoded in two-byte.
3954
3955    --- CODE RANGE of BIG5 ---
3956    (character set)      (range)
3957    ASCII                0x00 .. 0x7F
3958    Big5 (1st byte)      0xA1 .. 0xFE
3959         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3960    --------------------------
3961
3962   */
3963
3964 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3965    Check if a text is encoded in SJIS.  If it is, return
3966    CATEGORY_MASK_SJIS, else return 0.  */
3967
3968 static int
3969 detect_coding_sjis (coding, detect_info)
3970      struct coding_system *coding;
3971      struct coding_detection_info *detect_info;
3972 {
3973   const unsigned char *src = coding->source, *src_base;
3974   const unsigned char *src_end = coding->source + coding->src_bytes;
3975   int multibytep = coding->src_multibyte;
3976   int consumed_chars = 0;
3977   int found = 0;
3978   int c;
3979
3980   detect_info->checked |= CATEGORY_MASK_SJIS;
3981   /* A coding system of this category is always ASCII compatible.  */
3982   src += coding->head_ascii;
3983
3984   while (1)
3985     {
3986       src_base = src;
3987       ONE_MORE_BYTE (c);
3988       if (c < 0x80)
3989         continue;
3990       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3991         {
3992           ONE_MORE_BYTE (c);
3993           if (c < 0x40 || c == 0x7F || c > 0xFC)
3994             break;
3995           found = CATEGORY_MASK_SJIS;
3996         }
3997       else if (c >= 0xA0 && c < 0xE0)
3998         found = CATEGORY_MASK_SJIS;
3999       else
4000         break;
4001     }
4002   detect_info->rejected |= CATEGORY_MASK_SJIS;
4003   return 0;
4004
4005  no_more_source:
4006   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4007     {
4008       detect_info->rejected |= CATEGORY_MASK_SJIS;
4009       return 0;
4010     }
4011   detect_info->found |= found;
4012   return 1;
4013 }
4014
4015 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4016    Check if a text is encoded in BIG5.  If it is, return
4017    CATEGORY_MASK_BIG5, else return 0.  */
4018
4019 static int
4020 detect_coding_big5 (coding, detect_info)
4021      struct coding_system *coding;
4022      struct coding_detection_info *detect_info;
4023 {
4024   const unsigned char *src = coding->source, *src_base;
4025   const unsigned char *src_end = coding->source + coding->src_bytes;
4026   int multibytep = coding->src_multibyte;
4027   int consumed_chars = 0;
4028   int found = 0;
4029   int c;
4030
4031   detect_info->checked |= CATEGORY_MASK_BIG5;
4032   /* A coding system of this category is always ASCII compatible.  */
4033   src += coding->head_ascii;
4034
4035   while (1)
4036     {
4037       src_base = src;
4038       ONE_MORE_BYTE (c);
4039       if (c < 0x80)
4040         continue;
4041       if (c >= 0xA1)
4042         {
4043           ONE_MORE_BYTE (c);
4044           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4045             return 0;
4046           found = CATEGORY_MASK_BIG5;
4047         }
4048       else
4049         break;
4050     }
4051   detect_info->rejected |= CATEGORY_MASK_BIG5;
4052   return 0;
4053
4054  no_more_source:
4055   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4056     {
4057       detect_info->rejected |= CATEGORY_MASK_BIG5;
4058       return 0;
4059     }
4060   detect_info->found |= found;
4061   return 1;
4062 }
4063
4064 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4065    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4066
4067 static void
4068 decode_coding_sjis (coding)
4069      struct coding_system *coding;
4070 {
4071   const unsigned char *src = coding->source + coding->consumed;
4072   const unsigned char *src_end = coding->source + coding->src_bytes;
4073   const unsigned char *src_base;
4074   int *charbuf = coding->charbuf + coding->charbuf_used;
4075   int *charbuf_end
4076     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4077   int consumed_chars = 0, consumed_chars_base;
4078   int multibytep = coding->src_multibyte;
4079   struct charset *charset_roman, *charset_kanji, *charset_kana;
4080   struct charset *charset_kanji2;
4081   Lisp_Object attrs, charset_list, val;
4082   int char_offset = coding->produced_char;
4083   int last_offset = char_offset;
4084   int last_id = charset_ascii;
4085
4086   CODING_GET_INFO (coding, attrs, charset_list);
4087
4088   val = charset_list;
4089   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4090   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4091   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4092   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4093
4094   while (1)
4095     {
4096       int c, c1;
4097       struct charset *charset;
4098
4099       src_base = src;
4100       consumed_chars_base = consumed_chars;
4101
4102       if (charbuf >= charbuf_end)
4103         break;
4104
4105       ONE_MORE_BYTE (c);
4106       if (c < 0)
4107         goto invalid_code;
4108       if (c < 0x80)
4109         charset = charset_roman;
4110       else if (c == 0x80 || c == 0xA0)
4111         goto invalid_code;
4112       else if (c >= 0xA1 && c <= 0xDF)
4113         {
4114           /* SJIS -> JISX0201-Kana */
4115           c &= 0x7F;
4116           charset = charset_kana;
4117         }
4118       else if (c <= 0xEF)
4119         {
4120           /* SJIS -> JISX0208 */
4121           ONE_MORE_BYTE (c1);
4122           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4123             goto invalid_code;
4124           c = (c << 8) | c1;
4125           SJIS_TO_JIS (c);
4126           charset = charset_kanji;
4127         }
4128       else if (c <= 0xFC && charset_kanji2)
4129         {
4130           /* SJIS -> JISX0213-2 */
4131           ONE_MORE_BYTE (c1);
4132           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4133             goto invalid_code;
4134           c = (c << 8) | c1;
4135           SJIS_TO_JIS2 (c);
4136           charset = charset_kanji2;
4137         }
4138       else
4139         goto invalid_code;
4140       if (charset->id != charset_ascii
4141           && last_id != charset->id)
4142         {
4143           if (last_id != charset_ascii)
4144             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4145           last_id = charset->id;
4146           last_offset = char_offset;
4147         }
4148       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4149       *charbuf++ = c;
4150       char_offset++;
4151       continue;
4152
4153     invalid_code:
4154       src = src_base;
4155       consumed_chars = consumed_chars_base;
4156       ONE_MORE_BYTE (c);
4157       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4158       char_offset++;
4159       coding->errors++;
4160     }
4161
4162  no_more_source:
4163   if (last_id != charset_ascii)
4164     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4165   coding->consumed_char += consumed_chars_base;
4166   coding->consumed = src_base - coding->source;
4167   coding->charbuf_used = charbuf - coding->charbuf;
4168 }
4169
4170 static void
4171 decode_coding_big5 (coding)
4172      struct coding_system *coding;
4173 {
4174   const unsigned char *src = coding->source + coding->consumed;
4175   const unsigned char *src_end = coding->source + coding->src_bytes;
4176   const unsigned char *src_base;
4177   int *charbuf = coding->charbuf + coding->charbuf_used;
4178   int *charbuf_end
4179     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4180   int consumed_chars = 0, consumed_chars_base;
4181   int multibytep = coding->src_multibyte;
4182   struct charset *charset_roman, *charset_big5;
4183   Lisp_Object attrs, charset_list, val;
4184   int char_offset = coding->produced_char;
4185   int last_offset = char_offset;
4186   int last_id = charset_ascii;
4187
4188   CODING_GET_INFO (coding, attrs, charset_list);
4189   val = charset_list;
4190   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4191   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4192
4193   while (1)
4194     {
4195       int c, c1;
4196       struct charset *charset;
4197
4198       src_base = src;
4199       consumed_chars_base = consumed_chars;
4200
4201       if (charbuf >= charbuf_end)
4202         break;
4203
4204       ONE_MORE_BYTE (c);
4205
4206       if (c < 0)
4207         goto invalid_code;
4208       if (c < 0x80)
4209         charset = charset_roman;
4210       else
4211         {
4212           /* BIG5 -> Big5 */
4213           if (c < 0xA1 || c > 0xFE)
4214             goto invalid_code;
4215           ONE_MORE_BYTE (c1);
4216           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4217             goto invalid_code;
4218           c = c << 8 | c1;
4219           charset = charset_big5;
4220         }
4221       if (charset->id != charset_ascii
4222           && last_id != charset->id)
4223         {
4224           if (last_id != charset_ascii)
4225             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4226           last_id = charset->id;
4227           last_offset = char_offset;
4228         }
4229       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4230       *charbuf++ = c;
4231       char_offset++;
4232       continue;
4233
4234     invalid_code:
4235       src = src_base;
4236       consumed_chars = consumed_chars_base;
4237       ONE_MORE_BYTE (c);
4238       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4239       char_offset++;
4240       coding->errors++;
4241     }
4242
4243  no_more_source:
4244   if (last_id != charset_ascii)
4245     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4246   coding->consumed_char += consumed_chars_base;
4247   coding->consumed = src_base - coding->source;
4248   coding->charbuf_used = charbuf - coding->charbuf;
4249 }
4250
4251 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4252    This function can encode charsets `ascii', `katakana-jisx0201',
4253    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4254    are sure that all these charsets are registered as official charset
4255    (i.e. do not have extended leading-codes).  Characters of other
4256    charsets are produced without any encoding.  If SJIS_P is 1, encode
4257    SJIS text, else encode BIG5 text.  */
4258
4259 static int
4260 encode_coding_sjis (coding)
4261      struct coding_system *coding;
4262 {
4263   int multibytep = coding->dst_multibyte;
4264   int *charbuf = coding->charbuf;
4265   int *charbuf_end = charbuf + coding->charbuf_used;
4266   unsigned char *dst = coding->destination + coding->produced;
4267   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4268   int safe_room = 4;
4269   int produced_chars = 0;
4270   Lisp_Object attrs, charset_list, val;
4271   int ascii_compatible;
4272   struct charset *charset_roman, *charset_kanji, *charset_kana;
4273   struct charset *charset_kanji2;
4274   int c;
4275
4276   CODING_GET_INFO (coding, attrs, charset_list);
4277   val = charset_list;
4278   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4279   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4280   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4281   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4282
4283   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4284
4285   while (charbuf < charbuf_end)
4286     {
4287       ASSURE_DESTINATION (safe_room);
4288       c = *charbuf++;
4289       /* Now encode the character C.  */
4290       if (ASCII_CHAR_P (c) && ascii_compatible)
4291         EMIT_ONE_ASCII_BYTE (c);
4292       else if (CHAR_BYTE8_P (c))
4293         {
4294           c = CHAR_TO_BYTE8 (c);
4295           EMIT_ONE_BYTE (c);
4296         }
4297       else
4298         {
4299           unsigned code;
4300           struct charset *charset = char_charset (c, charset_list, &code);
4301
4302           if (!charset)
4303             {
4304               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4305                 {
4306                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4307                   charset = CHARSET_FROM_ID (charset_ascii);
4308                 }
4309               else
4310                 {
4311                   c = coding->default_char;
4312                   charset = char_charset (c, charset_list, &code);
4313                 }
4314             }
4315           if (code == CHARSET_INVALID_CODE (charset))
4316             abort ();
4317           if (charset == charset_kanji)
4318             {
4319               int c1, c2;
4320               JIS_TO_SJIS (code);
4321               c1 = code >> 8, c2 = code & 0xFF;
4322               EMIT_TWO_BYTES (c1, c2);
4323             }
4324           else if (charset == charset_kana)
4325             EMIT_ONE_BYTE (code | 0x80);
4326           else if (charset_kanji2 && charset == charset_kanji2)
4327             {
4328               int c1, c2;
4329
4330               c1 = code >> 8;
4331               if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4332                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4333                 {
4334                   JIS_TO_SJIS2 (code);
4335                   c1 = code >> 8, c2 = code & 0xFF;
4336                   EMIT_TWO_BYTES (c1, c2);
4337                 }
4338               else
4339                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4340             }
4341           else
4342             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4343         }
4344     }
4345   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4346   coding->produced_char += produced_chars;
4347   coding->produced = dst - coding->destination;
4348   return 0;
4349 }
4350
4351 static int
4352 encode_coding_big5 (coding)
4353      struct coding_system *coding;
4354 {
4355   int multibytep = coding->dst_multibyte;
4356   int *charbuf = coding->charbuf;
4357   int *charbuf_end = charbuf + coding->charbuf_used;
4358   unsigned char *dst = coding->destination + coding->produced;
4359   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4360   int safe_room = 4;
4361   int produced_chars = 0;
4362   Lisp_Object attrs, charset_list, val;
4363   int ascii_compatible;
4364   struct charset *charset_roman, *charset_big5;
4365   int c;
4366
4367   CODING_GET_INFO (coding, attrs, charset_list);
4368   val = charset_list;
4369   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4370   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4371   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4372
4373   while (charbuf < charbuf_end)
4374     {
4375       ASSURE_DESTINATION (safe_room);
4376       c = *charbuf++;
4377       /* Now encode the character C.  */
4378       if (ASCII_CHAR_P (c) && ascii_compatible)
4379         EMIT_ONE_ASCII_BYTE (c);
4380       else if (CHAR_BYTE8_P (c))
4381         {
4382           c = CHAR_TO_BYTE8 (c);
4383           EMIT_ONE_BYTE (c);
4384         }
4385       else
4386         {
4387           unsigned code;
4388           struct charset *charset = char_charset (c, charset_list, &code);
4389
4390           if (! charset)
4391             {
4392               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4393                 {
4394                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4395                   charset = CHARSET_FROM_ID (charset_ascii);
4396                 }
4397               else
4398                 {
4399                   c = coding->default_char;
4400                   charset = char_charset (c, charset_list, &code);
4401                 }
4402             }
4403           if (code == CHARSET_INVALID_CODE (charset))
4404             abort ();
4405           if (charset == charset_big5)
4406             {
4407               int c1, c2;
4408
4409               c1 = code >> 8, c2 = code & 0xFF;
4410               EMIT_TWO_BYTES (c1, c2);
4411             }
4412           else
4413             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4414         }
4415     }
4416   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4417   coding->produced_char += produced_chars;
4418   coding->produced = dst - coding->destination;
4419   return 0;
4420 }
4421
4422 \f
4423 /*** 10. CCL handlers ***/
4424
4425 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4426    Check if a text is encoded in a coding system of which
4427    encoder/decoder are written in CCL program.  If it is, return
4428    CATEGORY_MASK_CCL, else return 0.  */
4429
4430 static int
4431 detect_coding_ccl (coding, detect_info)
4432      struct coding_system *coding;
4433      struct coding_detection_info *detect_info;
4434 {
4435   const unsigned char *src = coding->source, *src_base;
4436   const unsigned char *src_end = coding->source + coding->src_bytes;
4437   int multibytep = coding->src_multibyte;
4438   int consumed_chars = 0;
4439   int found = 0;
4440   unsigned char *valids;
4441   int head_ascii = coding->head_ascii;
4442   Lisp_Object attrs;
4443
4444   detect_info->checked |= CATEGORY_MASK_CCL;
4445
4446   coding = &coding_categories[coding_category_ccl];
4447   valids = CODING_CCL_VALIDS (coding);
4448   attrs = CODING_ID_ATTRS (coding->id);
4449   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4450     src += head_ascii;
4451
4452   while (1)
4453     {
4454       int c;
4455
4456       src_base = src;
4457       ONE_MORE_BYTE (c);
4458       if (c < 0 || ! valids[c])
4459         break;
4460       if ((valids[c] > 1))
4461         found = CATEGORY_MASK_CCL;
4462     }
4463   detect_info->rejected |= CATEGORY_MASK_CCL;
4464   return 0;
4465
4466  no_more_source:
4467   detect_info->found |= found;
4468   return 1;
4469 }
4470
4471 static void
4472 decode_coding_ccl (coding)
4473      struct coding_system *coding;
4474 {
4475   const unsigned char *src = coding->source + coding->consumed;
4476   const unsigned char *src_end = coding->source + coding->src_bytes;
4477   int *charbuf = coding->charbuf + coding->charbuf_used;
4478   int *charbuf_end = coding->charbuf + coding->charbuf_size;
4479   int consumed_chars = 0;
4480   int multibytep = coding->src_multibyte;
4481   struct ccl_program ccl;
4482   int source_charbuf[1024];
4483   int source_byteidx[1024];
4484   Lisp_Object attrs, charset_list;
4485
4486   CODING_GET_INFO (coding, attrs, charset_list);
4487   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4488
4489   while (src < src_end)
4490     {
4491       const unsigned char *p = src;
4492       int *source, *source_end;
4493       int i = 0;
4494
4495       if (multibytep)
4496         while (i < 1024 && p < src_end)
4497           {
4498             source_byteidx[i] = p - src;
4499             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4500           }
4501       else
4502         while (i < 1024 && p < src_end)
4503           source_charbuf[i++] = *p++;
4504
4505       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4506         ccl.last_block = 1;
4507
4508       source = source_charbuf;
4509       source_end = source + i;
4510       while (source < source_end)
4511         {
4512           ccl_driver (&ccl, source, charbuf,
4513                       source_end - source, charbuf_end - charbuf,
4514                       charset_list);
4515           source += ccl.consumed;
4516           charbuf += ccl.produced;
4517           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4518             break;
4519         }
4520       if (source < source_end)
4521         src += source_byteidx[source - source_charbuf];
4522       else
4523         src = p;
4524       consumed_chars += source - source_charbuf;
4525
4526       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4527           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4528         break;
4529     }
4530
4531   switch (ccl.status)
4532     {
4533     case CCL_STAT_SUSPEND_BY_SRC:
4534       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4535       break;
4536     case CCL_STAT_SUSPEND_BY_DST:
4537       break;
4538     case CCL_STAT_QUIT:
4539     case CCL_STAT_INVALID_CMD:
4540       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4541       break;
4542     default:
4543       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4544       break;
4545     }
4546   coding->consumed_char += consumed_chars;
4547   coding->consumed = src - coding->source;
4548   coding->charbuf_used = charbuf - coding->charbuf;
4549 }
4550
4551 static int
4552 encode_coding_ccl (coding)
4553      struct coding_system *coding;
4554 {
4555   struct ccl_program ccl;
4556   int multibytep = coding->dst_multibyte;
4557   int *charbuf = coding->charbuf;
4558   int *charbuf_end = charbuf + coding->charbuf_used;
4559   unsigned char *dst = coding->destination + coding->produced;
4560   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4561   int destination_charbuf[1024];
4562   int i, produced_chars = 0;
4563   Lisp_Object attrs, charset_list;
4564
4565   CODING_GET_INFO (coding, attrs, charset_list);
4566   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4567
4568   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4569   ccl.dst_multibyte = coding->dst_multibyte;
4570
4571   while (charbuf < charbuf_end)
4572     {
4573       ccl_driver (&ccl, charbuf, destination_charbuf,
4574                   charbuf_end - charbuf, 1024, charset_list);
4575       if (multibytep)
4576         {
4577           ASSURE_DESTINATION (ccl.produced * 2);
4578           for (i = 0; i < ccl.produced; i++)
4579             EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4580         }
4581       else
4582         {
4583           ASSURE_DESTINATION (ccl.produced);
4584           for (i = 0; i < ccl.produced; i++)
4585             *dst++ = destination_charbuf[i] & 0xFF;
4586           produced_chars += ccl.produced;
4587         }
4588       charbuf += ccl.consumed;
4589       if (ccl.status == CCL_STAT_QUIT
4590           || ccl.status == CCL_STAT_INVALID_CMD)
4591         break;
4592     }
4593
4594   switch (ccl.status)
4595     {
4596     case CCL_STAT_SUSPEND_BY_SRC:
4597       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4598       break;
4599     case CCL_STAT_SUSPEND_BY_DST:
4600       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4601       break;
4602     case CCL_STAT_QUIT:
4603     case CCL_STAT_INVALID_CMD:
4604       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4605       break;
4606     default:
4607       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4608       break;
4609     }
4610
4611   coding->produced_char += produced_chars;
4612   coding->produced = dst - coding->destination;
4613   return 0;
4614 }
4615
4616
4617 \f
4618 /*** 10, 11. no-conversion handlers ***/
4619
4620 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4621
4622 static void
4623 decode_coding_raw_text (coding)
4624      struct coding_system *coding;
4625 {
4626   coding->chars_at_source = 1;
4627   coding->consumed_char = 0;
4628   coding->consumed = 0;
4629   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4630 }
4631
4632 static int
4633 encode_coding_raw_text (coding)
4634      struct coding_system *coding;
4635 {
4636   int multibytep = coding->dst_multibyte;
4637   int *charbuf = coding->charbuf;
4638   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4639   unsigned char *dst = coding->destination + coding->produced;
4640   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4641   int produced_chars = 0;
4642   int c;
4643
4644   if (multibytep)
4645     {
4646       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4647
4648       if (coding->src_multibyte)
4649         while (charbuf < charbuf_end)
4650           {
4651             ASSURE_DESTINATION (safe_room);
4652             c = *charbuf++;
4653             if (ASCII_CHAR_P (c))
4654               EMIT_ONE_ASCII_BYTE (c);
4655             else if (CHAR_BYTE8_P (c))
4656               {
4657                 c = CHAR_TO_BYTE8 (c);
4658                 EMIT_ONE_BYTE (c);
4659               }
4660             else
4661               {
4662                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4663
4664                 CHAR_STRING_ADVANCE (c, p1);
4665                 while (p0 < p1)
4666                   {
4667                     EMIT_ONE_BYTE (*p0);
4668                     p0++;
4669                   }
4670               }
4671           }
4672       else
4673         while (charbuf < charbuf_end)
4674           {
4675             ASSURE_DESTINATION (safe_room);
4676             c = *charbuf++;
4677             EMIT_ONE_BYTE (c);
4678           }
4679     }
4680   else
4681     {
4682       if (coding->src_multibyte)
4683         {
4684           int safe_room = MAX_MULTIBYTE_LENGTH;
4685
4686           while (charbuf < charbuf_end)
4687             {
4688               ASSURE_DESTINATION (safe_room);
4689               c = *charbuf++;
4690               if (ASCII_CHAR_P (c))
4691                 *dst++ = c;
4692               else if (CHAR_BYTE8_P (c))
4693                 *dst++ = CHAR_TO_BYTE8 (c);
4694               else
4695                 CHAR_STRING_ADVANCE (c, dst);
4696               produced_chars++;
4697             }
4698         }
4699       else
4700         {
4701           ASSURE_DESTINATION (charbuf_end - charbuf);
4702           while (charbuf < charbuf_end && dst < dst_end)
4703             *dst++ = *charbuf++;
4704           produced_chars = dst - (coding->destination + coding->dst_bytes);
4705         }
4706     }
4707   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4708   coding->produced_char += produced_chars;
4709   coding->produced = dst - coding->destination;
4710   return 0;
4711 }
4712
4713 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4714    Check if a text is encoded in a charset-based coding system.  If it
4715    is, return 1, else return 0.  */
4716
4717 static int
4718 detect_coding_charset (coding, detect_info)
4719      struct coding_system *coding;
4720      struct coding_detection_info *detect_info;
4721 {
4722   const unsigned char *src = coding->source, *src_base;
4723   const unsigned char *src_end = coding->source + coding->src_bytes;
4724   int multibytep = coding->src_multibyte;
4725   int consumed_chars = 0;
4726   Lisp_Object attrs, valids;
4727   int found = 0;
4728
4729   detect_info->checked |= CATEGORY_MASK_CHARSET;
4730
4731   coding = &coding_categories[coding_category_charset];
4732   attrs = CODING_ID_ATTRS (coding->id);
4733   valids = AREF (attrs, coding_attr_charset_valids);
4734
4735   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4736     src += coding->head_ascii;
4737
4738   while (1)
4739     {
4740       int c;
4741
4742       src_base = src;
4743       ONE_MORE_BYTE (c);
4744       if (c < 0)
4745         continue;
4746       if (NILP (AREF (valids, c)))
4747         break;
4748       if (c >= 0x80)
4749         found = CATEGORY_MASK_CHARSET;
4750     }
4751   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4752   return 0;
4753
4754  no_more_source:
4755   detect_info->found |= found;
4756   return 1;
4757 }
4758
4759 static void
4760 decode_coding_charset (coding)
4761      struct coding_system *coding;
4762 {
4763   const unsigned char *src = coding->source + coding->consumed;
4764   const unsigned char *src_end = coding->source + coding->src_bytes;
4765   const unsigned char *src_base;
4766   int *charbuf = coding->charbuf + coding->charbuf_used;
4767   int *charbuf_end
4768     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4769   int consumed_chars = 0, consumed_chars_base;
4770   int multibytep = coding->src_multibyte;
4771   Lisp_Object attrs, charset_list, valids;
4772   int char_offset = coding->produced_char;
4773   int last_offset = char_offset;
4774   int last_id = charset_ascii;
4775
4776   CODING_GET_INFO (coding, attrs, charset_list);
4777   valids = AREF (attrs, coding_attr_charset_valids);
4778
4779   while (1)
4780     {
4781       int c;
4782       Lisp_Object val;
4783       struct charset *charset;
4784       int dim;
4785       int len = 1;
4786       unsigned code;
4787
4788       src_base = src;
4789       consumed_chars_base = consumed_chars;
4790
4791       if (charbuf >= charbuf_end)
4792         break;
4793
4794       ONE_MORE_BYTE (c);
4795       if (c < 0)
4796         goto invalid_code;
4797       code = c;
4798
4799       val = AREF (valids, c);
4800       if (NILP (val))
4801         goto invalid_code;
4802       if (INTEGERP (val))
4803         {
4804           charset = CHARSET_FROM_ID (XFASTINT (val));
4805           dim = CHARSET_DIMENSION (charset);
4806           while (len < dim)
4807             {
4808               ONE_MORE_BYTE (c);
4809               code = (code << 8) | c;
4810               len++;
4811             }
4812           CODING_DECODE_CHAR (coding, src, src_base, src_end,
4813                               charset, code, c);
4814         }
4815       else
4816         {
4817           /* VAL is a list of charset IDs.  It is assured that the
4818              list is sorted by charset dimensions (smaller one
4819              comes first).  */
4820           while (CONSP (val))
4821             {
4822               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4823               dim = CHARSET_DIMENSION (charset);
4824               while (len < dim)
4825                 {
4826                   ONE_MORE_BYTE (c);
4827                   code = (code << 8) | c;
4828                   len++;
4829                 }
4830               CODING_DECODE_CHAR (coding, src, src_base,
4831                                   src_end, charset, code, c);
4832               if (c >= 0)
4833                 break;
4834               val = XCDR (val);
4835             }
4836         }
4837       if (c < 0)
4838         goto invalid_code;
4839       if (charset->id != charset_ascii
4840           && last_id != charset->id)
4841         {
4842           if (last_id != charset_ascii)
4843             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4844           last_id = charset->id;
4845           last_offset = char_offset;
4846         }
4847
4848       *charbuf++ = c;
4849       char_offset++;
4850       continue;
4851
4852     invalid_code:
4853       src = src_base;
4854       consumed_chars = consumed_chars_base;
4855       ONE_MORE_BYTE (c);
4856       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4857       char_offset++;
4858       coding->errors++;
4859     }
4860
4861  no_more_source:
4862   if (last_id != charset_ascii)
4863     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4864   coding->consumed_char += consumed_chars_base;
4865   coding->consumed = src_base - coding->source;
4866   coding->charbuf_used = charbuf - coding->charbuf;
4867 }
4868
4869 static int
4870 encode_coding_charset (coding)
4871      struct coding_system *coding;
4872 {
4873   int multibytep = coding->dst_multibyte;
4874   int *charbuf = coding->charbuf;
4875   int *charbuf_end = charbuf + coding->charbuf_used;
4876   unsigned char *dst = coding->destination + coding->produced;
4877   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4878   int safe_room = MAX_MULTIBYTE_LENGTH;
4879   int produced_chars = 0;
4880   Lisp_Object attrs, charset_list;
4881   int ascii_compatible;
4882   int c;
4883
4884   CODING_GET_INFO (coding, attrs, charset_list);
4885   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4886
4887   while (charbuf < charbuf_end)
4888     {
4889       struct charset *charset;
4890       unsigned code;
4891
4892       ASSURE_DESTINATION (safe_room);
4893       c = *charbuf++;
4894       if (ascii_compatible && ASCII_CHAR_P (c))
4895         EMIT_ONE_ASCII_BYTE (c);
4896       else if (CHAR_BYTE8_P (c))
4897         {
4898           c = CHAR_TO_BYTE8 (c);
4899           EMIT_ONE_BYTE (c);
4900         }
4901       else
4902         {
4903           charset = char_charset (c, charset_list, &code);
4904           if (charset)
4905             {
4906               if (CHARSET_DIMENSION (charset) == 1)
4907                 EMIT_ONE_BYTE (code);
4908               else if (CHARSET_DIMENSION (charset) == 2)
4909                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4910               else if (CHARSET_DIMENSION (charset) == 3)
4911                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4912               else
4913                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4914                                  (code >> 8) & 0xFF, code & 0xFF);
4915             }
4916           else
4917             {
4918               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4919                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4920               else
4921                 c = coding->default_char;
4922               EMIT_ONE_BYTE (c);
4923             }
4924         }
4925     }
4926
4927   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4928   coding->produced_char += produced_chars;
4929   coding->produced = dst - coding->destination;
4930   return 0;
4931 }
4932
4933 \f
4934 /*** 7. C library functions ***/
4935
4936 /* Setup coding context CODING from information about CODING_SYSTEM.
4937    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4938    CODING_SYSTEM is invalid, signal an error.  */
4939
4940 void
4941 setup_coding_system (coding_system, coding)
4942      Lisp_Object coding_system;
4943      struct coding_system *coding;
4944 {
4945   Lisp_Object attrs;
4946   Lisp_Object eol_type;
4947   Lisp_Object coding_type;
4948   Lisp_Object val;
4949
4950   if (NILP (coding_system))
4951     coding_system = Qundecided;
4952
4953   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4954
4955   attrs = CODING_ID_ATTRS (coding->id);
4956   eol_type = CODING_ID_EOL_TYPE (coding->id);
4957
4958   coding->mode = 0;
4959   coding->head_ascii = -1;
4960   coding->common_flags
4961     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4962   if (! NILP (CODING_ATTR_POST_READ (attrs)))
4963     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4964   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4965     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4966   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4967     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4968
4969   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4970   coding->max_charset_id = SCHARS (val) - 1;
4971   coding->safe_charsets = (char *) SDATA (val);
4972   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4973
4974   coding_type = CODING_ATTR_TYPE (attrs);
4975   if (EQ (coding_type, Qundecided))
4976     {
4977       coding->detector = NULL;
4978       coding->decoder = decode_coding_raw_text;
4979       coding->encoder = encode_coding_raw_text;
4980       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4981     }
4982   else if (EQ (coding_type, Qiso_2022))
4983     {
4984       int i;
4985       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4986
4987       /* Invoke graphic register 0 to plane 0.  */
4988       CODING_ISO_INVOCATION (coding, 0) = 0;
4989       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4990       CODING_ISO_INVOCATION (coding, 1)
4991         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4992       /* Setup the initial status of designation.  */
4993       for (i = 0; i < 4; i++)
4994         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4995       /* Not single shifting initially.  */
4996       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4997       /* Beginning of buffer should also be regarded as bol. */
4998       CODING_ISO_BOL (coding) = 1;
4999       coding->detector = detect_coding_iso_2022;
5000       coding->decoder = decode_coding_iso_2022;
5001       coding->encoder = encode_coding_iso_2022;
5002       if (flags & CODING_ISO_FLAG_SAFE)
5003         coding->mode |= CODING_MODE_SAFE_ENCODING;
5004       coding->common_flags
5005         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5006             | CODING_REQUIRE_FLUSHING_MASK);
5007       if (flags & CODING_ISO_FLAG_COMPOSITION)
5008         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
5009       if (flags & CODING_ISO_FLAG_DESIGNATION)
5010         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
5011       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5012         {
5013           setup_iso_safe_charsets (attrs);
5014           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5015           coding->max_charset_id = SCHARS (val) - 1;
5016           coding->safe_charsets = (char *) SDATA (val);
5017         }
5018       CODING_ISO_FLAGS (coding) = flags;
5019     }
5020   else if (EQ (coding_type, Qcharset))
5021     {
5022       coding->detector = detect_coding_charset;
5023       coding->decoder = decode_coding_charset;
5024       coding->encoder = encode_coding_charset;
5025       coding->common_flags
5026         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5027     }
5028   else if (EQ (coding_type, Qutf_8))
5029     {
5030       coding->detector = detect_coding_utf_8;
5031       coding->decoder = decode_coding_utf_8;
5032       coding->encoder = encode_coding_utf_8;
5033       coding->common_flags
5034         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5035     }
5036   else if (EQ (coding_type, Qutf_16))
5037     {
5038       val = AREF (attrs, coding_attr_utf_16_bom);
5039       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5040                                     : EQ (val, Qt) ? utf_16_with_bom
5041                                     : utf_16_without_bom);
5042       val = AREF (attrs, coding_attr_utf_16_endian);
5043       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5044                                        : utf_16_little_endian);
5045       CODING_UTF_16_SURROGATE (coding) = 0;
5046       coding->detector = detect_coding_utf_16;
5047       coding->decoder = decode_coding_utf_16;
5048       coding->encoder = encode_coding_utf_16;
5049       coding->common_flags
5050         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5051       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5052         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5053     }
5054   else if (EQ (coding_type, Qccl))
5055     {
5056       coding->detector = detect_coding_ccl;
5057       coding->decoder = decode_coding_ccl;
5058       coding->encoder = encode_coding_ccl;
5059       coding->common_flags
5060         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5061             | CODING_REQUIRE_FLUSHING_MASK);
5062     }
5063   else if (EQ (coding_type, Qemacs_mule))
5064     {
5065       coding->detector = detect_coding_emacs_mule;
5066       coding->decoder = decode_coding_emacs_mule;
5067       coding->encoder = encode_coding_emacs_mule;
5068       coding->common_flags
5069         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5070       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5071           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5072         {
5073           Lisp_Object tail, safe_charsets;
5074           int max_charset_id = 0;
5075
5076           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5077                tail = XCDR (tail))
5078             if (max_charset_id < XFASTINT (XCAR (tail)))
5079               max_charset_id = XFASTINT (XCAR (tail));
5080           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5081                                         make_number (255));
5082           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5083                tail = XCDR (tail))
5084             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5085           coding->max_charset_id = max_charset_id;
5086           coding->safe_charsets = (char *) SDATA (safe_charsets);
5087         }
5088     }
5089   else if (EQ (coding_type, Qshift_jis))
5090     {
5091       coding->detector = detect_coding_sjis;
5092       coding->decoder = decode_coding_sjis;
5093       coding->encoder = encode_coding_sjis;
5094       coding->common_flags
5095         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5096     }
5097   else if (EQ (coding_type, Qbig5))
5098     {
5099       coding->detector = detect_coding_big5;
5100       coding->decoder = decode_coding_big5;
5101       coding->encoder = encode_coding_big5;
5102       coding->common_flags
5103         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5104     }
5105   else                          /* EQ (coding_type, Qraw_text) */
5106     {
5107       coding->detector = NULL;
5108       coding->decoder = decode_coding_raw_text;
5109       coding->encoder = encode_coding_raw_text;
5110       if (! EQ (eol_type, Qunix))
5111         {
5112           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5113           if (! VECTORP (eol_type))
5114             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5115         }
5116
5117     }
5118
5119   return;
5120 }
5121
5122 /* Return a list of charsets supported by CODING.  */
5123
5124 Lisp_Object
5125 coding_charset_list (coding)
5126      struct coding_system *coding;
5127 {
5128   Lisp_Object attrs, charset_list;
5129
5130   CODING_GET_INFO (coding, attrs, charset_list);
5131   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5132     {
5133       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5134
5135       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5136         charset_list = Viso_2022_charset_list;
5137     }
5138   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5139     {
5140       charset_list = Vemacs_mule_charset_list;
5141     }
5142   return charset_list;
5143 }
5144
5145
5146 /* Return raw-text or one of its subsidiaries that has the same
5147    eol_type as CODING-SYSTEM.  */
5148
5149 Lisp_Object
5150 raw_text_coding_system (coding_system)
5151      Lisp_Object coding_system;
5152 {
5153   Lisp_Object spec, attrs;
5154   Lisp_Object eol_type, raw_text_eol_type;
5155
5156   if (NILP (coding_system))
5157     return Qraw_text;
5158   spec = CODING_SYSTEM_SPEC (coding_system);
5159   attrs = AREF (spec, 0);
5160
5161   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5162     return coding_system;
5163
5164   eol_type = AREF (spec, 2);
5165   if (VECTORP (eol_type))
5166     return Qraw_text;
5167   spec = CODING_SYSTEM_SPEC (Qraw_text);
5168   raw_text_eol_type = AREF (spec, 2);
5169   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5170           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5171           : AREF (raw_text_eol_type, 2));
5172 }
5173
5174
5175 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5176    does, return one of the subsidiary that has the same eol-spec as
5177    PARENT.  Otherwise, return CODING_SYSTEM.  */
5178
5179 Lisp_Object
5180 coding_inherit_eol_type (coding_system, parent)
5181      Lisp_Object coding_system, parent;
5182 {
5183   Lisp_Object spec, eol_type;
5184
5185   if (NILP (coding_system))
5186     coding_system = Qraw_text;
5187   spec = CODING_SYSTEM_SPEC (coding_system);
5188   eol_type = AREF (spec, 2);
5189   if (VECTORP (eol_type)
5190       && ! NILP (parent))
5191     {
5192       Lisp_Object parent_spec;
5193       Lisp_Object parent_eol_type;
5194
5195       parent_spec
5196         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5197       parent_eol_type = AREF (parent_spec, 2);
5198       if (EQ (parent_eol_type, Qunix))
5199         coding_system = AREF (eol_type, 0);
5200       else if (EQ (parent_eol_type, Qdos))
5201         coding_system = AREF (eol_type, 1);
5202       else if (EQ (parent_eol_type, Qmac))
5203         coding_system = AREF (eol_type, 2);
5204     }
5205   return coding_system;
5206 }
5207
5208 /* Emacs has a mechanism to automatically detect a coding system if it
5209    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5210    it's impossible to distinguish some coding systems accurately
5211    because they use the same range of codes.  So, at first, coding
5212    systems are categorized into 7, those are:
5213
5214    o coding-category-emacs-mule
5215
5216         The category for a coding system which has the same code range
5217         as Emacs' internal format.  Assigned the coding-system (Lisp
5218         symbol) `emacs-mule' by default.
5219
5220    o coding-category-sjis
5221
5222         The category for a coding system which has the same code range
5223         as SJIS.  Assigned the coding-system (Lisp
5224         symbol) `japanese-shift-jis' by default.
5225
5226    o coding-category-iso-7
5227
5228         The category for a coding system which has the same code range
5229         as ISO2022 of 7-bit environment.  This doesn't use any locking
5230         shift and single shift functions.  This can encode/decode all
5231         charsets.  Assigned the coding-system (Lisp symbol)
5232         `iso-2022-7bit' by default.
5233
5234    o coding-category-iso-7-tight
5235
5236         Same as coding-category-iso-7 except that this can
5237         encode/decode only the specified charsets.
5238
5239    o coding-category-iso-8-1
5240
5241         The category for a coding system which has the same code range
5242         as ISO2022 of 8-bit environment and graphic plane 1 used only
5243         for DIMENSION1 charset.  This doesn't use any locking shift
5244         and single shift functions.  Assigned the coding-system (Lisp
5245         symbol) `iso-latin-1' by default.
5246
5247    o coding-category-iso-8-2
5248
5249         The category for a coding system which has the same code range
5250         as ISO2022 of 8-bit environment and graphic plane 1 used only
5251         for DIMENSION2 charset.  This doesn't use any locking shift
5252         and single shift functions.  Assigned the coding-system (Lisp
5253         symbol) `japanese-iso-8bit' by default.
5254
5255    o coding-category-iso-7-else
5256
5257         The category for a coding system which has the same code range
5258         as ISO2022 of 7-bit environemnt but uses locking shift or
5259         single shift functions.  Assigned the coding-system (Lisp
5260         symbol) `iso-2022-7bit-lock' by default.
5261
5262    o coding-category-iso-8-else
5263
5264         The category for a coding system which has the same code range
5265         as ISO2022 of 8-bit environemnt but uses locking shift or
5266         single shift functions.  Assigned the coding-system (Lisp
5267         symbol) `iso-2022-8bit-ss2' by default.
5268
5269    o coding-category-big5
5270
5271         The category for a coding system which has the same code range
5272         as BIG5.  Assigned the coding-system (Lisp symbol)
5273         `cn-big5' by default.
5274
5275    o coding-category-utf-8
5276
5277         The category for a coding system which has the same code range
5278         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
5279         symbol) `utf-8' by default.
5280
5281    o coding-category-utf-16-be
5282
5283         The category for a coding system in which a text has an
5284         Unicode signature (cf. Unicode Standard) in the order of BIG
5285         endian at the head.  Assigned the coding-system (Lisp symbol)
5286         `utf-16-be' by default.
5287
5288    o coding-category-utf-16-le
5289
5290         The category for a coding system in which a text has an
5291         Unicode signature (cf. Unicode Standard) in the order of
5292         LITTLE endian at the head.  Assigned the coding-system (Lisp
5293         symbol) `utf-16-le' by default.
5294
5295    o coding-category-ccl
5296
5297         The category for a coding system of which encoder/decoder is
5298         written in CCL programs.  The default value is nil, i.e., no
5299         coding system is assigned.
5300
5301    o coding-category-binary
5302
5303         The category for a coding system not categorized in any of the
5304         above.  Assigned the coding-system (Lisp symbol)
5305         `no-conversion' by default.
5306
5307    Each of them is a Lisp symbol and the value is an actual
5308    `coding-system's (this is also a Lisp symbol) assigned by a user.
5309    What Emacs does actually is to detect a category of coding system.
5310    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5311    decide only one possible category, it selects a category of the
5312    highest priority.  Priorities of categories are also specified by a
5313    user in a Lisp variable `coding-category-list'.
5314
5315 */
5316
5317 #define EOL_SEEN_NONE   0
5318 #define EOL_SEEN_LF     1
5319 #define EOL_SEEN_CR     2
5320 #define EOL_SEEN_CRLF   4
5321
5322 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5323    SOURCE is encoded.  If CATEGORY is one of
5324    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5325    two-byte, else they are encoded by one-byte.
5326
5327    Return one of EOL_SEEN_XXX.  */
5328
5329 #define MAX_EOL_CHECK_COUNT 3
5330
5331 static int
5332 detect_eol (source, src_bytes, category)
5333      const unsigned char *source;
5334      EMACS_INT src_bytes;
5335      enum coding_category category;
5336 {
5337   const unsigned char *src = source, *src_end = src + src_bytes;
5338   unsigned char c;
5339   int total  = 0;
5340   int eol_seen = EOL_SEEN_NONE;
5341
5342   if ((1 << category) & CATEGORY_MASK_UTF_16)
5343     {
5344       int msb, lsb;
5345
5346       msb = category == (coding_category_utf_16_le
5347                          | coding_category_utf_16_le_nosig);
5348       lsb = 1 - msb;
5349
5350       while (src + 1 < src_end)
5351         {
5352           c = src[lsb];
5353           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5354             {
5355               int this_eol;
5356
5357               if (c == '\n')
5358                 this_eol = EOL_SEEN_LF;
5359               else if (src + 3 >= src_end
5360                        || src[msb + 2] != 0
5361                        || src[lsb + 2] != '\n')
5362                 this_eol = EOL_SEEN_CR;
5363               else
5364                 this_eol = EOL_SEEN_CRLF;
5365
5366               if (eol_seen == EOL_SEEN_NONE)
5367                 /* This is the first end-of-line.  */
5368                 eol_seen = this_eol;
5369               else if (eol_seen != this_eol)
5370                 {
5371                   /* The found type is different from what found before.  */
5372                   eol_seen = EOL_SEEN_LF;
5373                   break;
5374                 }
5375               if (++total == MAX_EOL_CHECK_COUNT)
5376                 break;
5377             }
5378           src += 2;
5379         }
5380     }
5381   else
5382     {
5383       while (src < src_end)
5384         {
5385           c = *src++;
5386           if (c == '\n' || c == '\r')
5387             {
5388               int this_eol;
5389
5390               if (c == '\n')
5391                 this_eol = EOL_SEEN_LF;
5392               else if (src >= src_end || *src != '\n')
5393                 this_eol = EOL_SEEN_CR;
5394               else
5395                 this_eol = EOL_SEEN_CRLF, src++;
5396
5397               if (eol_seen == EOL_SEEN_NONE)
5398                 /* This is the first end-of-line.  */
5399                 eol_seen = this_eol;
5400               else if (eol_seen != this_eol)
5401                 {
5402                   /* The found type is different from what found before.  */
5403                   eol_seen = EOL_SEEN_LF;
5404                   break;
5405                 }
5406               if (++total == MAX_EOL_CHECK_COUNT)
5407                 break;
5408             }
5409         }
5410     }
5411   return eol_seen;
5412 }
5413
5414
5415 static Lisp_Object
5416 adjust_coding_eol_type (coding, eol_seen)
5417      struct coding_system *coding;
5418      int eol_seen;
5419 {
5420   Lisp_Object eol_type;
5421
5422   eol_type = CODING_ID_EOL_TYPE (coding->id);
5423   if (eol_seen & EOL_SEEN_LF)
5424     {
5425       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5426       eol_type = Qunix;
5427     }
5428   else if (eol_seen & EOL_SEEN_CRLF)
5429     {
5430       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5431       eol_type = Qdos;
5432     }
5433   else if (eol_seen & EOL_SEEN_CR)
5434     {
5435       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5436       eol_type = Qmac;
5437     }
5438   return eol_type;
5439 }
5440
5441 /* Detect how a text specified in CODING is encoded.  If a coding
5442    system is detected, update fields of CODING by the detected coding
5443    system.  */
5444
5445 void
5446 detect_coding (coding)
5447      struct coding_system *coding;
5448 {
5449   const unsigned char *src, *src_end;
5450
5451   coding->consumed = coding->consumed_char = 0;
5452   coding->produced = coding->produced_char = 0;
5453   coding_set_source (coding);
5454
5455   src_end = coding->source + coding->src_bytes;
5456
5457   /* If we have not yet decided the text encoding type, detect it
5458      now.  */
5459   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5460     {
5461       int c, i;
5462       struct coding_detection_info detect_info;
5463
5464       detect_info.checked = detect_info.found = detect_info.rejected = 0;
5465       for (i = 0, src = coding->source; src < src_end; i++, src++)
5466         {
5467           c = *src;
5468           if (c & 0x80)
5469             break;
5470           if (c < 0x20
5471               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5472               && ! inhibit_iso_escape_detection
5473               && ! detect_info.checked)
5474             {
5475               coding->head_ascii = src - (coding->source + coding->consumed);
5476               if (detect_coding_iso_2022 (coding, &detect_info))
5477                 {
5478                   /* We have scanned the whole data.  */
5479                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5480                     /* We didn't find an 8-bit code.  */
5481                     src = src_end;
5482                   break;
5483                 }
5484             }
5485         }
5486       coding->head_ascii = src - (coding->source + coding->consumed);
5487
5488       if (coding->head_ascii < coding->src_bytes
5489           || detect_info.found)
5490         {
5491           enum coding_category category;
5492           struct coding_system *this;
5493
5494           if (coding->head_ascii == coding->src_bytes)
5495             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
5496             for (i = 0; i < coding_category_raw_text; i++)
5497               {
5498                 category = coding_priorities[i];
5499                 this = coding_categories + category;
5500                 if (detect_info.found & (1 << category))
5501                   break;
5502               }
5503           else
5504             for (i = 0; i < coding_category_raw_text; i++)
5505               {
5506                 category = coding_priorities[i];
5507                 this = coding_categories + category;
5508                 if (this->id < 0)
5509                   {
5510                     /* No coding system of this category is defined.  */
5511                     detect_info.rejected |= (1 << category);
5512                   }
5513                 else if (category >= coding_category_raw_text)
5514                   continue;
5515                 else if (detect_info.checked & (1 << category))
5516                   {
5517                     if (detect_info.found & (1 << category))
5518                       break;
5519                   }
5520                 else if ((*(this->detector)) (coding, &detect_info)
5521                          && detect_info.found & (1 << category))
5522                   {
5523                     if (category == coding_category_utf_16_auto)
5524                       {
5525                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5526                           category = coding_category_utf_16_le;
5527                         else
5528                           category = coding_category_utf_16_be;
5529                       }
5530                     break;
5531                   }
5532               }
5533
5534           if (i < coding_category_raw_text)
5535             setup_coding_system (CODING_ID_NAME (this->id), coding);
5536           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5537             setup_coding_system (Qraw_text, coding);
5538           else if (detect_info.rejected)
5539             for (i = 0; i < coding_category_raw_text; i++)
5540               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5541                 {
5542                   this = coding_categories + coding_priorities[i];
5543                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5544                   break;
5545                 }
5546         }
5547     }
5548   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5549            == coding_category_utf_16_auto)
5550     {
5551       Lisp_Object coding_systems;
5552       struct coding_detection_info detect_info;
5553
5554       coding_systems
5555         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5556       detect_info.found = detect_info.rejected = 0;
5557       if (CONSP (coding_systems)
5558           && detect_coding_utf_16 (coding, &detect_info))
5559         {
5560           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5561             setup_coding_system (XCAR (coding_systems), coding);
5562           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5563             setup_coding_system (XCDR (coding_systems), coding);
5564         }
5565     }
5566 }
5567
5568
5569 static void
5570 decode_eol (coding)
5571      struct coding_system *coding;
5572 {
5573   Lisp_Object eol_type;
5574   unsigned char *p, *pbeg, *pend;
5575
5576   eol_type = CODING_ID_EOL_TYPE (coding->id);
5577   if (EQ (eol_type, Qunix))
5578     return;
5579
5580   if (NILP (coding->dst_object))
5581     pbeg = coding->destination;
5582   else
5583     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5584   pend = pbeg + coding->produced;
5585
5586   if (VECTORP (eol_type))
5587     {
5588       int eol_seen = EOL_SEEN_NONE;
5589
5590       for (p = pbeg; p < pend; p++)
5591         {
5592           if (*p == '\n')
5593             eol_seen |= EOL_SEEN_LF;
5594           else if (*p == '\r')
5595             {
5596               if (p + 1 < pend && *(p + 1) == '\n')
5597                 {
5598                   eol_seen |= EOL_SEEN_CRLF;
5599                   p++;
5600                 }
5601               else
5602                 eol_seen |= EOL_SEEN_CR;
5603             }
5604         }
5605       if (eol_seen != EOL_SEEN_NONE
5606           && eol_seen != EOL_SEEN_LF
5607           && eol_seen != EOL_SEEN_CRLF
5608           && eol_seen != EOL_SEEN_CR)
5609         eol_seen = EOL_SEEN_LF;
5610       if (eol_seen != EOL_SEEN_NONE)
5611         eol_type = adjust_coding_eol_type (coding, eol_seen);
5612     }
5613
5614   if (EQ (eol_type, Qmac))
5615     {
5616       for (p = pbeg; p < pend; p++)
5617         if (*p == '\r')
5618           *p = '\n';
5619     }
5620   else if (EQ (eol_type, Qdos))
5621     {
5622       int n = 0;
5623
5624       if (NILP (coding->dst_object))
5625         {
5626           /* Start deleting '\r' from the tail to minimize the memory
5627              movement.  */
5628           for (p = pend - 2; p >= pbeg; p--)
5629             if (*p == '\r')
5630               {
5631                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5632                 n++;
5633               }
5634         }
5635       else
5636         {
5637           int pos_byte = coding->dst_pos_byte;
5638           int pos = coding->dst_pos;
5639           int pos_end = pos + coding->produced_char - 1;
5640
5641           while (pos < pos_end)
5642             {
5643               p = BYTE_POS_ADDR (pos_byte);
5644               if (*p == '\r' && p[1] == '\n')
5645                 {
5646                   del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5647                   n++;
5648                   pos_end--;
5649                 }
5650               pos++;
5651               pos_byte += BYTES_BY_CHAR_HEAD (*p);
5652             }
5653         }
5654       coding->produced -= n;
5655       coding->produced_char -= n;
5656     }
5657 }
5658
5659
5660 /* Return a translation table (or list of them) from coding system
5661    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5662    decoding (ENCODEP is zero). */
5663
5664 static Lisp_Object
5665 get_translation_table (attrs, encodep, max_lookup)
5666      Lisp_Object attrs;
5667      int encodep, *max_lookup;
5668 {
5669   Lisp_Object standard, translation_table;
5670   Lisp_Object val;
5671
5672   if (encodep)
5673     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5674       standard = Vstandard_translation_table_for_encode;
5675   else
5676     translation_table = CODING_ATTR_DECODE_TBL (attrs),
5677       standard = Vstandard_translation_table_for_decode;
5678   if (NILP (translation_table))
5679     translation_table = standard;
5680   else
5681     {
5682       if (SYMBOLP (translation_table))
5683         translation_table = Fget (translation_table, Qtranslation_table);
5684       else if (CONSP (translation_table))
5685         {
5686           translation_table = Fcopy_sequence (translation_table);
5687           for (val = translation_table; CONSP (val); val = XCDR (val))
5688             if (SYMBOLP (XCAR (val)))
5689               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5690         }
5691       if (CHAR_TABLE_P (standard))
5692         {
5693           if (CONSP (translation_table))
5694             translation_table = nconc2 (translation_table,
5695                                         Fcons (standard, Qnil));
5696           else
5697             translation_table = Fcons (translation_table,
5698                                        Fcons (standard, Qnil));
5699         }
5700     }
5701
5702   if (max_lookup)
5703     {
5704       *max_lookup = 1;
5705       if (CHAR_TABLE_P (translation_table)
5706           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5707         {
5708           val = XCHAR_TABLE (translation_table)->extras[1];
5709           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5710             *max_lookup = XFASTINT (val);
5711         }
5712       else if (CONSP (translation_table))
5713         {
5714           Lisp_Object tail, val;
5715
5716           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5717             if (CHAR_TABLE_P (XCAR (tail))
5718                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5719               {
5720                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5721                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5722                   *max_lookup = XFASTINT (val);
5723               }
5724         }
5725     }
5726   return translation_table;
5727 }
5728
5729 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
5730   do {                                                          \
5731     trans = Qnil;                                               \
5732     if (CHAR_TABLE_P (table))                                   \
5733       {                                                         \
5734         trans = CHAR_TABLE_REF (table, c);                      \
5735         if (CHARACTERP (trans))                                 \
5736           c = XFASTINT (trans), trans = Qnil;                   \
5737       }                                                         \
5738     else if (CONSP (table))                                     \
5739       {                                                         \
5740         Lisp_Object tail;                                       \
5741                                                                 \
5742         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
5743           if (CHAR_TABLE_P (XCAR (tail)))                       \
5744             {                                                   \
5745               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
5746               if (CHARACTERP (trans))                           \
5747                 c = XFASTINT (trans), trans = Qnil;             \
5748               else if (! NILP (trans))                          \
5749                 break;                                          \
5750             }                                                   \
5751       }                                                         \
5752   } while (0)
5753
5754
5755 static Lisp_Object
5756 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5757      Lisp_Object val;
5758      int *buf, *buf_end;
5759      int last_block;
5760      int *from_nchars, *to_nchars;
5761 {
5762   /* VAL is TO or (([FROM-CHAR ...] .  TO) ...) where TO is TO-CHAR or
5763      [TO-CHAR ...].  */
5764   if (CONSP (val))
5765     {
5766       Lisp_Object from, tail;
5767       int i, len;
5768
5769       for (tail = val; CONSP (tail); tail = XCDR (tail))
5770         {
5771           val = XCAR (tail);
5772           from = XCAR (val);
5773           len = ASIZE (from);
5774           for (i = 0; i < len; i++)
5775             {
5776               if (buf + i == buf_end)
5777                 {
5778                   if (! last_block)
5779                     return Qt;
5780                   break;
5781                 }
5782               if (XINT (AREF (from, i)) != buf[i])
5783                 break;
5784             }
5785           if (i == len)
5786             {
5787               val = XCDR (val);
5788               *from_nchars = len;
5789               break;
5790             }
5791         }
5792       if (! CONSP (tail))
5793         return Qnil;
5794     }
5795   if (VECTORP (val))
5796     *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5797   else
5798     *buf = XINT (val);
5799   return val;
5800 }
5801
5802
5803 static int
5804 produce_chars (coding, translation_table, last_block)
5805      struct coding_system *coding;
5806      Lisp_Object translation_table;
5807      int last_block;
5808 {
5809   unsigned char *dst = coding->destination + coding->produced;
5810   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5811   int produced;
5812   int produced_chars = 0;
5813   int carryover = 0;
5814
5815   if (! coding->chars_at_source)
5816     {
5817       /* Characters are in coding->charbuf.  */
5818       int *buf = coding->charbuf;
5819       int *buf_end = buf + coding->charbuf_used;
5820
5821       if (BUFFERP (coding->src_object)
5822           && EQ (coding->src_object, coding->dst_object))
5823         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5824
5825       while (buf < buf_end)
5826         {
5827           int c = *buf, i;
5828
5829           if (c >= 0)
5830             {
5831               int from_nchars = 1, to_nchars = 1;
5832               Lisp_Object trans = Qnil;
5833
5834               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5835               if (! NILP (trans))
5836                 {
5837                   trans = get_translation (trans, buf, buf_end, last_block,
5838                                            &from_nchars, &to_nchars);
5839                   if (EQ (trans, Qt))
5840                     break;
5841                   c = *buf;
5842                 }
5843
5844               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5845                 {
5846                   dst = alloc_destination (coding,
5847                                            buf_end - buf
5848                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
5849                                            dst);
5850                   dst_end = coding->destination + coding->dst_bytes;
5851                 }
5852
5853               for (i = 0; i < to_nchars; i++)
5854                 {
5855                   if (i > 0)
5856                     c = XINT (AREF (trans, i));
5857                   if (coding->dst_multibyte
5858                       || ! CHAR_BYTE8_P (c))
5859                     CHAR_STRING_ADVANCE (c, dst);
5860                   else
5861                     *dst++ = CHAR_TO_BYTE8 (c);
5862                 }
5863               produced_chars += to_nchars;
5864               *buf++ = to_nchars;
5865               while (--from_nchars > 0)
5866                 *buf++ = 0;
5867             }
5868           else
5869             /* This is an annotation datum.  (-C) is the length.  */
5870             buf += -c;
5871         }
5872       carryover = buf_end - buf;
5873     }
5874   else
5875     {
5876       const unsigned char *src = coding->source;
5877       const unsigned char *src_end = src + coding->src_bytes;
5878       Lisp_Object eol_type;
5879
5880       eol_type = CODING_ID_EOL_TYPE (coding->id);
5881
5882       if (coding->src_multibyte != coding->dst_multibyte)
5883         {
5884           if (coding->src_multibyte)
5885             {
5886               int multibytep = 1;
5887               int consumed_chars;
5888
5889               while (1)
5890                 {
5891                   const unsigned char *src_base = src;
5892                   int c;
5893
5894                   ONE_MORE_BYTE (c);
5895                   if (c == '\r')
5896                     {
5897                       if (EQ (eol_type, Qdos))
5898                         {
5899                           if (src == src_end)
5900                             {
5901                               record_conversion_result
5902                                 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5903                               goto no_more_source;
5904                             }
5905                           if (*src == '\n')
5906                             c = *src++;
5907                         }
5908                       else if (EQ (eol_type, Qmac))
5909                         c = '\n';
5910                     }
5911                   if (dst == dst_end)
5912                     {
5913                       coding->consumed = src - coding->source;
5914
5915                     if (EQ (coding->src_object, coding->dst_object))
5916                       dst_end = (unsigned char *) src;
5917                     if (dst == dst_end)
5918                       {
5919                         dst = alloc_destination (coding, src_end - src + 1,
5920                                                  dst);
5921                         dst_end = coding->destination + coding->dst_bytes;
5922                         coding_set_source (coding);
5923                         src = coding->source + coding->consumed;
5924                         src_end = coding->source + coding->src_bytes;
5925                       }
5926                     }
5927                   *dst++ = c;
5928                   produced_chars++;
5929                 }
5930             no_more_source:
5931               ;
5932             }
5933           else
5934             while (src < src_end)
5935               {
5936                 int multibytep = 1;
5937                 int c = *src++;
5938
5939                 if (c == '\r')
5940                   {
5941                     if (EQ (eol_type, Qdos))
5942                       {
5943                         if (src < src_end
5944                             && *src == '\n')
5945                           c = *src++;
5946                       }
5947                     else if (EQ (eol_type, Qmac))
5948                       c = '\n';
5949                   }
5950                 if (dst >= dst_end - 1)
5951                   {
5952                     coding->consumed = src - coding->source;
5953
5954                     if (EQ (coding->src_object, coding->dst_object))
5955                       dst_end = (unsigned char *) src;
5956                     if (dst >= dst_end - 1)
5957                       {
5958                         dst = alloc_destination (coding, src_end - src + 2,
5959                                                  dst);
5960                         dst_end = coding->destination + coding->dst_bytes;
5961                         coding_set_source (coding);
5962                         src = coding->source + coding->consumed;
5963                         src_end = coding->source + coding->src_bytes;
5964                       }
5965                   }
5966                 EMIT_ONE_BYTE (c);
5967               }
5968         }
5969       else
5970         {
5971           if (!EQ (coding->src_object, coding->dst_object))
5972             {
5973               int require = coding->src_bytes - coding->dst_bytes;
5974
5975               if (require > 0)
5976                 {
5977                   EMACS_INT offset = src - coding->source;
5978
5979                   dst = alloc_destination (coding, require, dst);
5980                   coding_set_source (coding);
5981                   src = coding->source + offset;
5982                   src_end = coding->source + coding->src_bytes;
5983                 }
5984             }
5985           produced_chars = coding->src_chars;
5986           while (src < src_end)
5987             {
5988               int c = *src++;
5989
5990               if (c == '\r')
5991                 {
5992                   if (EQ (eol_type, Qdos))
5993                     {
5994                       if (src < src_end
5995                           && *src == '\n')
5996                         c = *src++;
5997                       produced_chars--;
5998                     }
5999                   else if (EQ (eol_type, Qmac))
6000                     c = '\n';
6001                 }
6002               *dst++ = c;
6003             }
6004         }
6005       coding->consumed = coding->src_bytes;
6006       coding->consumed_char = coding->src_chars;
6007     }
6008
6009   produced = dst - (coding->destination + coding->produced);
6010   if (BUFFERP (coding->dst_object))
6011     insert_from_gap (produced_chars, produced);
6012   coding->produced += produced;
6013   coding->produced_char += produced_chars;
6014   return carryover;
6015 }
6016
6017 /* Compose text in CODING->object according to the annotation data at
6018    CHARBUF.  CHARBUF is an array:
6019      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
6020  */
6021
6022 static INLINE void
6023 produce_composition (coding, charbuf, pos)
6024      struct coding_system *coding;
6025      int *charbuf;
6026      EMACS_INT pos;
6027 {
6028   int len;
6029   EMACS_INT to;
6030   enum composition_method method;
6031   Lisp_Object components;
6032
6033   len = -charbuf[0];
6034   to = pos + charbuf[2];
6035   if (to <= pos)
6036     return;
6037   method = (enum composition_method) (charbuf[3]);
6038
6039   if (method == COMPOSITION_RELATIVE)
6040     components = Qnil;
6041   else if (method >= COMPOSITION_WITH_RULE
6042            && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6043     {
6044       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6045       int i;
6046
6047       len -= 4;
6048       charbuf += 4;
6049       for (i = 0; i < len; i++)
6050         {
6051           args[i] = make_number (charbuf[i]);
6052           if (charbuf[i] < 0)
6053             return;
6054         }
6055       components = (method == COMPOSITION_WITH_ALTCHARS
6056                     ? Fstring (len, args) : Fvector (len, args));
6057     }
6058   else
6059     return;
6060   compose_text (pos, to, components, Qnil, coding->dst_object);
6061 }
6062
6063
6064 /* Put `charset' property on text in CODING->object according to
6065    the annotation data at CHARBUF.  CHARBUF is an array:
6066      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6067  */
6068
6069 static INLINE void
6070 produce_charset (coding, charbuf, pos)
6071      struct coding_system *coding;
6072      int *charbuf;
6073      EMACS_INT pos;
6074 {
6075   EMACS_INT from = pos - charbuf[2];
6076   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6077
6078   Fput_text_property (make_number (from), make_number (pos),
6079                       Qcharset, CHARSET_NAME (charset),
6080                       coding->dst_object);
6081 }
6082
6083
6084 #define CHARBUF_SIZE 0x4000
6085
6086 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
6087   do {                                                                  \
6088     int size = CHARBUF_SIZE;;                                           \
6089                                                                         \
6090     coding->charbuf = NULL;                                             \
6091     while (size > 1024)                                                 \
6092       {                                                                 \
6093         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
6094         if (coding->charbuf)                                            \
6095           break;                                                        \
6096         size >>= 1;                                                     \
6097       }                                                                 \
6098     if (! coding->charbuf)                                              \
6099       {                                                                 \
6100         record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6101         return coding->result;                                          \
6102       }                                                                 \
6103     coding->charbuf_size = size;                                        \
6104   } while (0)
6105
6106
6107 static void
6108 produce_annotation (coding, pos)
6109      struct coding_system *coding;
6110      EMACS_INT pos;
6111 {
6112   int *charbuf = coding->charbuf;
6113   int *charbuf_end = charbuf + coding->charbuf_used;
6114
6115   if (NILP (coding->dst_object))
6116     return;
6117
6118   while (charbuf < charbuf_end)
6119     {
6120       if (*charbuf >= 0)
6121         pos += *charbuf++;
6122       else
6123         {
6124           int len = -*charbuf;
6125           switch (charbuf[1])
6126             {
6127             case CODING_ANNOTATE_COMPOSITION_MASK:
6128               produce_composition (coding, charbuf, pos);
6129               break;
6130             case CODING_ANNOTATE_CHARSET_MASK:
6131               produce_charset (coding, charbuf, pos);
6132               break;
6133             default:
6134               abort ();
6135             }
6136           charbuf += len;
6137         }
6138     }
6139 }
6140
6141 /* Decode the data at CODING->src_object into CODING->dst_object.
6142    CODING->src_object is a buffer, a string, or nil.
6143    CODING->dst_object is a buffer.
6144
6145    If CODING->src_object is a buffer, it must be the current buffer.
6146    In this case, if CODING->src_pos is positive, it is a position of
6147    the source text in the buffer, otherwise, the source text is in the
6148    gap area of the buffer, and CODING->src_pos specifies the offset of
6149    the text from GPT (which must be the same as PT).  If this is the
6150    same buffer as CODING->dst_object, CODING->src_pos must be
6151    negative.
6152
6153    If CODING->src_object is a string, CODING->src_pos is an index to
6154    that string.
6155
6156    If CODING->src_object is nil, CODING->source must already point to
6157    the non-relocatable memory area.  In this case, CODING->src_pos is
6158    an offset from CODING->source.
6159
6160    The decoded data is inserted at the current point of the buffer
6161    CODING->dst_object.
6162 */
6163
6164 static int
6165 decode_coding (coding)
6166      struct coding_system *coding;
6167 {
6168   Lisp_Object attrs;
6169   Lisp_Object undo_list;
6170   Lisp_Object translation_table;
6171   int carryover;
6172   int i;
6173
6174   if (BUFFERP (coding->src_object)
6175       && coding->src_pos > 0
6176       && coding->src_pos < GPT
6177       && coding->src_pos + coding->src_chars > GPT)
6178     move_gap_both (coding->src_pos, coding->src_pos_byte);
6179
6180   undo_list = Qt;
6181   if (BUFFERP (coding->dst_object))
6182     {
6183       if (current_buffer != XBUFFER (coding->dst_object))
6184         set_buffer_internal (XBUFFER (coding->dst_object));
6185       if (GPT != PT)
6186         move_gap_both (PT, PT_BYTE);
6187       undo_list = current_buffer->undo_list;
6188       current_buffer->undo_list = Qt;
6189     }
6190
6191   coding->consumed = coding->consumed_char = 0;
6192   coding->produced = coding->produced_char = 0;
6193   coding->chars_at_source = 0;
6194   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6195   coding->errors = 0;
6196
6197   ALLOC_CONVERSION_WORK_AREA (coding);
6198
6199   attrs = CODING_ID_ATTRS (coding->id);
6200   translation_table = get_translation_table (attrs, 0, NULL);
6201
6202   carryover = 0;
6203   do
6204     {
6205       EMACS_INT pos = coding->dst_pos + coding->produced_char;
6206
6207       coding_set_source (coding);
6208       coding->annotated = 0;
6209       coding->charbuf_used = carryover;
6210       (*(coding->decoder)) (coding);
6211       coding_set_destination (coding);
6212       carryover = produce_chars (coding, translation_table, 0);
6213       if (coding->annotated)
6214         produce_annotation (coding, pos);
6215       for (i = 0; i < carryover; i++)
6216         coding->charbuf[i]
6217           = coding->charbuf[coding->charbuf_used - carryover + i];
6218     }
6219   while (coding->consumed < coding->src_bytes
6220          && (coding->result == CODING_RESULT_SUCCESS
6221              || coding->result == CODING_RESULT_INVALID_SRC));
6222
6223   if (carryover > 0)
6224     {
6225       coding_set_destination (coding);
6226       coding->charbuf_used = carryover;
6227       produce_chars (coding, translation_table, 1);
6228     }
6229
6230   coding->carryover_bytes = 0;
6231   if (coding->consumed < coding->src_bytes)
6232     {
6233       int nbytes = coding->src_bytes - coding->consumed;
6234       const unsigned char *src;
6235
6236       coding_set_source (coding);
6237       coding_set_destination (coding);
6238       src = coding->source + coding->consumed;
6239
6240       if (coding->mode & CODING_MODE_LAST_BLOCK)
6241         {
6242           /* Flush out unprocessed data as binary chars.  We are sure
6243              that the number of data is less than the size of
6244              coding->charbuf.  */
6245           coding->charbuf_used = 0;
6246           while (nbytes-- > 0)
6247             {
6248               int c = *src++;
6249
6250               if (c & 0x80)
6251                 c = BYTE8_TO_CHAR (c);
6252               coding->charbuf[coding->charbuf_used++] = c;
6253             }
6254           produce_chars (coding, Qnil, 1);
6255         }
6256       else
6257         {
6258           /* Record unprocessed bytes in coding->carryover.  We are
6259              sure that the number of data is less than the size of
6260              coding->carryover.  */
6261           unsigned char *p = coding->carryover;
6262
6263           coding->carryover_bytes = nbytes;
6264           while (nbytes-- > 0)
6265             *p++ = *src++;
6266         }
6267       coding->consumed = coding->src_bytes;
6268     }
6269
6270   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6271     decode_eol (coding);
6272   if (BUFFERP (coding->dst_object))
6273     {
6274       current_buffer->undo_list = undo_list;
6275       record_insert (coding->dst_pos, coding->produced_char);
6276     }
6277   return coding->result;
6278 }
6279
6280
6281 /* Extract an annotation datum from a composition starting at POS and
6282    ending before LIMIT of CODING->src_object (buffer or string), store
6283    the data in BUF, set *STOP to a starting position of the next
6284    composition (if any) or to LIMIT, and return the address of the
6285    next element of BUF.
6286
6287    If such an annotation is not found, set *STOP to a starting
6288    position of a composition after POS (if any) or to LIMIT, and
6289    return BUF.  */
6290
6291 static INLINE int *
6292 handle_composition_annotation (pos, limit, coding, buf, stop)
6293      EMACS_INT pos, limit;
6294      struct coding_system *coding;
6295      int *buf;
6296      EMACS_INT *stop;
6297 {
6298   EMACS_INT start, end;
6299   Lisp_Object prop;
6300
6301   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6302       || end > limit)
6303     *stop = limit;
6304   else if (start > pos)
6305     *stop = start;
6306   else
6307     {
6308       if (start == pos)
6309         {
6310           /* We found a composition.  Store the corresponding
6311              annotation data in BUF.  */
6312           int *head = buf;
6313           enum composition_method method = COMPOSITION_METHOD (prop);
6314           int nchars = COMPOSITION_LENGTH (prop);
6315
6316           ADD_COMPOSITION_DATA (buf, nchars, method);
6317           if (method != COMPOSITION_RELATIVE)
6318             {
6319               Lisp_Object components;
6320               int len, i, i_byte;
6321
6322               components = COMPOSITION_COMPONENTS (prop);
6323               if (VECTORP (components))
6324                 {
6325                   len = XVECTOR (components)->size;
6326                   for (i = 0; i < len; i++)
6327                     *buf++ = XINT (AREF (components, i));
6328                 }
6329               else if (STRINGP (components))
6330                 {
6331                   len = SCHARS (components);
6332                   i = i_byte = 0;
6333                   while (i < len)
6334                     {
6335                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6336                       buf++;
6337                     }
6338                 }
6339               else if (INTEGERP (components))
6340                 {
6341                   len = 1;
6342                   *buf++ = XINT (components);
6343                 }
6344               else if (CONSP (components))
6345                 {
6346                   for (len = 0; CONSP (components);
6347                        len++, components = XCDR (components))
6348                     *buf++ = XINT (XCAR (components));
6349                 }
6350               else
6351                 abort ();
6352               *head -= len;
6353             }
6354         }
6355
6356       if (find_composition (end, limit, &start, &end, &prop,
6357                             coding->src_object)
6358           && end <= limit)
6359         *stop = start;
6360       else
6361         *stop = limit;
6362     }
6363   return buf;
6364 }
6365
6366
6367 /* Extract an annotation datum from a text property `charset' at POS of
6368    CODING->src_object (buffer of string), store the data in BUF, set
6369    *STOP to the position where the value of `charset' property changes
6370    (limiting by LIMIT), and return the address of the next element of
6371    BUF.
6372
6373    If the property value is nil, set *STOP to the position where the
6374    property value is non-nil (limiting by LIMIT), and return BUF.  */
6375
6376 static INLINE int *
6377 handle_charset_annotation (pos, limit, coding, buf, stop)
6378      EMACS_INT pos, limit;
6379      struct coding_system *coding;
6380      int *buf;
6381      EMACS_INT *stop;
6382 {
6383   Lisp_Object val, next;
6384   int id;
6385
6386   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6387   if (! NILP (val) && CHARSETP (val))
6388     id = XINT (CHARSET_SYMBOL_ID (val));
6389   else
6390     id = -1;
6391   ADD_CHARSET_DATA (buf, 0, id);
6392   next = Fnext_single_property_change (make_number (pos), Qcharset,
6393                                        coding->src_object,
6394                                        make_number (limit));
6395   *stop = XINT (next);
6396   return buf;
6397 }
6398
6399
6400 static void
6401 consume_chars (coding, translation_table, max_lookup)
6402      struct coding_system *coding;
6403      Lisp_Object translation_table;
6404      int max_lookup;
6405 {
6406   int *buf = coding->charbuf;
6407   int *buf_end = coding->charbuf + coding->charbuf_size;
6408   const unsigned char *src = coding->source + coding->consumed;
6409   const unsigned char *src_end = coding->source + coding->src_bytes;
6410   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6411   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6412   int multibytep = coding->src_multibyte;
6413   Lisp_Object eol_type;
6414   int c;
6415   EMACS_INT stop, stop_composition, stop_charset;
6416   int *lookup_buf = NULL;
6417
6418   if (! NILP (translation_table))
6419     lookup_buf = alloca (sizeof (int) * max_lookup);
6420
6421   eol_type = CODING_ID_EOL_TYPE (coding->id);
6422   if (VECTORP (eol_type))
6423     eol_type = Qunix;
6424
6425   /* Note: composition handling is not yet implemented.  */
6426   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6427
6428   if (NILP (coding->src_object))
6429     stop = stop_composition = stop_charset = end_pos;
6430   else
6431     {
6432       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6433         stop = stop_composition = pos;
6434       else
6435         stop = stop_composition = end_pos;
6436       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6437         stop = stop_charset = pos;
6438       else
6439         stop_charset = end_pos;
6440     }
6441
6442   /* Compensate for CRLF and conversion.  */
6443   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6444   while (buf < buf_end)
6445     {
6446       Lisp_Object trans;
6447
6448       if (pos == stop)
6449         {
6450           if (pos == end_pos)
6451             break;
6452           if (pos == stop_composition)
6453             buf = handle_composition_annotation (pos, end_pos, coding,
6454                                                  buf, &stop_composition);
6455           if (pos == stop_charset)
6456             buf = handle_charset_annotation (pos, end_pos, coding,
6457                                              buf, &stop_charset);
6458           stop = (stop_composition < stop_charset
6459                   ? stop_composition : stop_charset);
6460         }
6461
6462       if (! multibytep)
6463         {
6464           EMACS_INT bytes;
6465
6466           if (coding->encoder == encode_coding_raw_text)
6467             c = *src++, pos++;
6468           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6469             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6470           else
6471             c = BYTE8_TO_CHAR (*src), src++, pos++;
6472         }
6473       else
6474         c = STRING_CHAR_ADVANCE (src), pos++;
6475       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6476         c = '\n';
6477       if (! EQ (eol_type, Qunix))
6478         {
6479           if (c == '\n')
6480             {
6481               if (EQ (eol_type, Qdos))
6482                 *buf++ = '\r';
6483               else
6484                 c = '\r';
6485             }
6486         }
6487
6488       trans = Qnil;
6489       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6490       if (NILP (trans))
6491         *buf++ = c;
6492       else
6493         {
6494           int from_nchars = 1, to_nchars = 1;
6495           int *lookup_buf_end;
6496           const unsigned char *p = src;
6497           int i;
6498
6499           lookup_buf[0] = c;
6500           for (i = 1; i < max_lookup && p < src_end; i++)
6501             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6502           lookup_buf_end = lookup_buf + i;
6503           trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6504                                    &from_nchars, &to_nchars);
6505           if (EQ (trans, Qt)
6506               || buf + to_nchars > buf_end)
6507             break;
6508           *buf++ = *lookup_buf;
6509           for (i = 1; i < to_nchars; i++)
6510             *buf++ = XINT (AREF (trans, i));
6511           for (i = 1; i < from_nchars; i++, pos++)
6512             src += MULTIBYTE_LENGTH_NO_CHECK (src);
6513         }
6514     }
6515
6516   coding->consumed = src - coding->source;
6517   coding->consumed_char = pos - coding->src_pos;
6518   coding->charbuf_used = buf - coding->charbuf;
6519   coding->chars_at_source = 0;
6520 }
6521
6522
6523 /* Encode the text at CODING->src_object into CODING->dst_object.
6524    CODING->src_object is a buffer or a string.
6525    CODING->dst_object is a buffer or nil.
6526
6527    If CODING->src_object is a buffer, it must be the current buffer.
6528    In this case, if CODING->src_pos is positive, it is a position of
6529    the source text in the buffer, otherwise. the source text is in the
6530    gap area of the buffer, and coding->src_pos specifies the offset of
6531    the text from GPT (which must be the same as PT).  If this is the
6532    same buffer as CODING->dst_object, CODING->src_pos must be
6533    negative and CODING should not have `pre-write-conversion'.
6534
6535    If CODING->src_object is a string, CODING should not have
6536    `pre-write-conversion'.
6537
6538    If CODING->dst_object is a buffer, the encoded data is inserted at
6539    the current point of that buffer.
6540
6541    If CODING->dst_object is nil, the encoded data is placed at the
6542    memory area specified by CODING->destination.  */
6543
6544 static int
6545 encode_coding (coding)
6546      struct coding_system *coding;
6547 {
6548   Lisp_Object attrs;
6549   Lisp_Object translation_table;
6550   int max_lookup;
6551
6552   attrs = CODING_ID_ATTRS (coding->id);
6553   if (coding->encoder == encode_coding_raw_text)
6554     translation_table = Qnil, max_lookup = 0;
6555   else
6556     translation_table = get_translation_table (attrs, 1, &max_lookup);
6557
6558   if (BUFFERP (coding->dst_object))
6559     {
6560       set_buffer_internal (XBUFFER (coding->dst_object));
6561       coding->dst_multibyte
6562         = ! NILP (current_buffer->enable_multibyte_characters);
6563     }
6564
6565   coding->consumed = coding->consumed_char = 0;
6566   coding->produced = coding->produced_char = 0;
6567   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6568   coding->errors = 0;
6569
6570   ALLOC_CONVERSION_WORK_AREA (coding);
6571
6572   do {
6573     coding_set_source (coding);
6574     consume_chars (coding, translation_table, max_lookup);
6575     coding_set_destination (coding);
6576     (*(coding->encoder)) (coding);
6577   } while (coding->consumed_char < coding->src_chars);
6578
6579   if (BUFFERP (coding->dst_object))
6580     insert_from_gap (coding->produced_char, coding->produced);
6581
6582   return (coding->result);
6583 }
6584
6585
6586 /* Name (or base name) of work buffer for code conversion.  */
6587 static Lisp_Object Vcode_conversion_workbuf_name;
6588
6589 /* A working buffer used by the top level conversion.  Once it is
6590    created, it is never destroyed.  It has the name
6591    Vcode_conversion_workbuf_name.  The other working buffers are
6592    destroyed after the use is finished, and their names are modified
6593    versions of Vcode_conversion_workbuf_name.  */
6594 static Lisp_Object Vcode_conversion_reused_workbuf;
6595
6596 /* 1 iff Vcode_conversion_reused_workbuf is already in use.  */
6597 static int reused_workbuf_in_use;
6598
6599
6600 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
6601    multibyteness of returning buffer.  */
6602
6603 static Lisp_Object
6604 make_conversion_work_buffer (multibyte)
6605      int multibyte;
6606 {
6607   Lisp_Object name, workbuf;
6608   struct buffer *current;
6609
6610   if (reused_workbuf_in_use++)
6611     {
6612       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6613       workbuf = Fget_buffer_create (name);
6614     }
6615   else
6616     {
6617       name = Vcode_conversion_workbuf_name;
6618       workbuf = Fget_buffer_create (name);
6619       if (NILP (Vcode_conversion_reused_workbuf))
6620         Vcode_conversion_reused_workbuf = workbuf;
6621     }
6622   current = current_buffer;
6623   set_buffer_internal (XBUFFER (workbuf));
6624   Ferase_buffer ();
6625   current_buffer->undo_list = Qt;
6626   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6627   set_buffer_internal (current);
6628   return workbuf;
6629 }
6630
6631
6632 static Lisp_Object
6633 code_conversion_restore (arg)
6634      Lisp_Object arg;
6635 {
6636   Lisp_Object current, workbuf;
6637   struct gcpro gcpro1;
6638
6639   GCPRO1 (arg);
6640   current = XCAR (arg);
6641   workbuf = XCDR (arg);
6642   if (! NILP (workbuf))
6643     {
6644       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6645         reused_workbuf_in_use = 0;
6646       else if (! NILP (Fbuffer_live_p (workbuf)))
6647         Fkill_buffer (workbuf);
6648     }
6649   set_buffer_internal (XBUFFER (current));
6650   UNGCPRO;
6651   return Qnil;
6652 }
6653
6654 Lisp_Object
6655 code_conversion_save (with_work_buf, multibyte)
6656      int with_work_buf, multibyte;
6657 {
6658   Lisp_Object workbuf = Qnil;
6659
6660   if (with_work_buf)
6661     workbuf = make_conversion_work_buffer (multibyte);
6662   record_unwind_protect (code_conversion_restore,
6663                          Fcons (Fcurrent_buffer (), workbuf));
6664   return workbuf;
6665 }
6666
6667 int
6668 decode_coding_gap (coding, chars, bytes)
6669      struct coding_system *coding;
6670      EMACS_INT chars, bytes;
6671 {
6672   int count = specpdl_ptr - specpdl;
6673   Lisp_Object attrs;
6674
6675   code_conversion_save (0, 0);
6676
6677   coding->src_object = Fcurrent_buffer ();
6678   coding->src_chars = chars;
6679   coding->src_bytes = bytes;
6680   coding->src_pos = -chars;
6681   coding->src_pos_byte = -bytes;
6682   coding->src_multibyte = chars < bytes;
6683   coding->dst_object = coding->src_object;
6684   coding->dst_pos = PT;
6685   coding->dst_pos_byte = PT_BYTE;
6686   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6687
6688   if (CODING_REQUIRE_DETECTION (coding))
6689     detect_coding (coding);
6690
6691   coding->mode |= CODING_MODE_LAST_BLOCK;
6692   decode_coding (coding);
6693
6694   attrs = CODING_ID_ATTRS (coding->id);
6695   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6696     {
6697       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6698       Lisp_Object val;
6699
6700       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6701       val = call1 (CODING_ATTR_POST_READ (attrs),
6702                    make_number (coding->produced_char));
6703       CHECK_NATNUM (val);
6704       coding->produced_char += Z - prev_Z;
6705       coding->produced += Z_BYTE - prev_Z_BYTE;
6706     }
6707
6708   unbind_to (count, Qnil);
6709   return coding->result;
6710 }
6711
6712 int
6713 encode_coding_gap (coding, chars, bytes)
6714      struct coding_system *coding;
6715      EMACS_INT chars, bytes;
6716 {
6717   int count = specpdl_ptr - specpdl;
6718
6719   code_conversion_save (0, 0);
6720
6721   coding->src_object = Fcurrent_buffer ();
6722   coding->src_chars = chars;
6723   coding->src_bytes = bytes;
6724   coding->src_pos = -chars;
6725   coding->src_pos_byte = -bytes;
6726   coding->src_multibyte = chars < bytes;
6727   coding->dst_object = coding->src_object;
6728   coding->dst_pos = PT;
6729   coding->dst_pos_byte = PT_BYTE;
6730
6731   encode_coding (coding);
6732
6733   unbind_to (count, Qnil);
6734   return coding->result;
6735 }
6736
6737
6738 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6739    SRC_OBJECT into DST_OBJECT by coding context CODING.
6740
6741    SRC_OBJECT is a buffer, a string, or Qnil.
6742
6743    If it is a buffer, the text is at point of the buffer.  FROM and TO
6744    are positions in the buffer.
6745
6746    If it is a string, the text is at the beginning of the string.
6747    FROM and TO are indices to the string.
6748
6749    If it is nil, the text is at coding->source.  FROM and TO are
6750    indices to coding->source.
6751
6752    DST_OBJECT is a buffer, Qt, or Qnil.
6753
6754    If it is a buffer, the decoded text is inserted at point of the
6755    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6756    is deleted.
6757
6758    If it is Qt, a string is made from the decoded text, and
6759    set in CODING->dst_object.
6760
6761    If it is Qnil, the decoded text is stored at CODING->destination.
6762    The caller must allocate CODING->dst_bytes bytes at
6763    CODING->destination by xmalloc.  If the decoded text is longer than
6764    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6765  */
6766
6767 void
6768 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6769                       dst_object)
6770      struct coding_system *coding;
6771      Lisp_Object src_object;
6772      EMACS_INT from, from_byte, to, to_byte;
6773      Lisp_Object dst_object;
6774 {
6775   int count = specpdl_ptr - specpdl;
6776   unsigned char *destination;
6777   EMACS_INT dst_bytes;
6778   EMACS_INT chars = to - from;
6779   EMACS_INT bytes = to_byte - from_byte;
6780   Lisp_Object attrs;
6781   Lisp_Object buffer;
6782   int saved_pt = -1, saved_pt_byte;
6783
6784   buffer = Fcurrent_buffer ();
6785
6786   if (NILP (dst_object))
6787     {
6788       destination = coding->destination;
6789       dst_bytes = coding->dst_bytes;
6790     }
6791
6792   coding->src_object = src_object;
6793   coding->src_chars = chars;
6794   coding->src_bytes = bytes;
6795   coding->src_multibyte = chars < bytes;
6796
6797   if (STRINGP (src_object))
6798     {
6799       coding->src_pos = from;
6800       coding->src_pos_byte = from_byte;
6801     }
6802   else if (BUFFERP (src_object))
6803     {
6804       set_buffer_internal (XBUFFER (src_object));
6805       if (from != GPT)
6806         move_gap_both (from, from_byte);
6807       if (EQ (src_object, dst_object))
6808         {
6809           saved_pt = PT, saved_pt_byte = PT_BYTE;
6810           TEMP_SET_PT_BOTH (from, from_byte);
6811           del_range_both (from, from_byte, to, to_byte, 1);
6812           coding->src_pos = -chars;
6813           coding->src_pos_byte = -bytes;
6814         }
6815       else
6816         {
6817           coding->src_pos = from;
6818           coding->src_pos_byte = from_byte;
6819         }
6820     }
6821
6822   if (CODING_REQUIRE_DETECTION (coding))
6823     detect_coding (coding);
6824   attrs = CODING_ID_ATTRS (coding->id);
6825
6826   if (EQ (dst_object, Qt)
6827       || (! NILP (CODING_ATTR_POST_READ (attrs))
6828           && NILP (dst_object)))
6829     {
6830       coding->dst_object = code_conversion_save (1, 1);
6831       coding->dst_pos = BEG;
6832       coding->dst_pos_byte = BEG_BYTE;
6833       coding->dst_multibyte = 1;
6834     }
6835   else if (BUFFERP (dst_object))
6836     {
6837       code_conversion_save (0, 0);
6838       coding->dst_object = dst_object;
6839       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6840       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6841       coding->dst_multibyte
6842         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6843     }
6844   else
6845     {
6846       code_conversion_save (0, 0);
6847       coding->dst_object = Qnil;
6848       coding->dst_multibyte = 1;
6849     }
6850
6851   decode_coding (coding);
6852
6853   if (BUFFERP (coding->dst_object))
6854     set_buffer_internal (XBUFFER (coding->dst_object));
6855
6856   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6857     {
6858       struct gcpro gcpro1, gcpro2;
6859       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6860       Lisp_Object val;
6861
6862       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6863       GCPRO2 (coding->src_object, coding->dst_object);
6864       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
6865                         make_number (coding->produced_char));
6866       UNGCPRO;
6867       CHECK_NATNUM (val);
6868       coding->produced_char += Z - prev_Z;
6869       coding->produced += Z_BYTE - prev_Z_BYTE;
6870     }
6871
6872   if (EQ (dst_object, Qt))
6873     {
6874       coding->dst_object = Fbuffer_string ();
6875     }
6876   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6877     {
6878       set_buffer_internal (XBUFFER (coding->dst_object));
6879       if (dst_bytes < coding->produced)
6880         {
6881           destination
6882             = (unsigned char *) xrealloc (destination, coding->produced);
6883           if (! destination)
6884             {
6885               record_conversion_result (coding,
6886                                         CODING_RESULT_INSUFFICIENT_DST);
6887               unbind_to (count, Qnil);
6888               return;
6889             }
6890           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6891             move_gap_both (BEGV, BEGV_BYTE);
6892           bcopy (BEGV_ADDR, destination, coding->produced);
6893           coding->destination = destination;
6894         }
6895     }
6896
6897   if (saved_pt >= 0)
6898     {
6899       /* This is the case of:
6900          (BUFFERP (src_object) && EQ (src_object, dst_object))
6901          As we have moved PT while replacing the original buffer
6902          contents, we must recover it now.  */
6903       set_buffer_internal (XBUFFER (src_object));
6904       if (saved_pt < from)
6905         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6906       else if (saved_pt < from + chars)
6907         TEMP_SET_PT_BOTH (from, from_byte);
6908       else if (! NILP (current_buffer->enable_multibyte_characters))
6909         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6910                           saved_pt_byte + (coding->produced - bytes));
6911       else
6912         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6913                           saved_pt_byte + (coding->produced - bytes));
6914     }
6915
6916   unbind_to (count, coding->dst_object);
6917 }
6918
6919
6920 void
6921 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6922                       dst_object)
6923      struct coding_system *coding;
6924      Lisp_Object src_object;
6925      EMACS_INT from, from_byte, to, to_byte;
6926      Lisp_Object dst_object;
6927 {
6928   int count = specpdl_ptr - specpdl;
6929   EMACS_INT chars = to - from;
6930   EMACS_INT bytes = to_byte - from_byte;
6931   Lisp_Object attrs;
6932   Lisp_Object buffer;
6933   int saved_pt = -1, saved_pt_byte;
6934   int kill_src_buffer = 0;
6935
6936   buffer = Fcurrent_buffer ();
6937
6938   coding->src_object = src_object;
6939   coding->src_chars = chars;
6940   coding->src_bytes = bytes;
6941   coding->src_multibyte = chars < bytes;
6942
6943   attrs = CODING_ID_ATTRS (coding->id);
6944
6945   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6946     {
6947       coding->src_object = code_conversion_save (1, coding->src_multibyte);
6948       set_buffer_internal (XBUFFER (coding->src_object));
6949       if (STRINGP (src_object))
6950         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6951       else if (BUFFERP (src_object))
6952         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6953       else
6954         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6955
6956       if (EQ (src_object, dst_object))
6957         {
6958           set_buffer_internal (XBUFFER (src_object));
6959           saved_pt = PT, saved_pt_byte = PT_BYTE;
6960           del_range_both (from, from_byte, to, to_byte, 1);
6961           set_buffer_internal (XBUFFER (coding->src_object));
6962         }
6963
6964       {
6965         Lisp_Object args[3];
6966
6967         args[0] = CODING_ATTR_PRE_WRITE (attrs);
6968         args[1] = make_number (BEG);
6969         args[2] = make_number (Z);
6970         safe_call (3, args);
6971       }
6972       if (XBUFFER (coding->src_object) != current_buffer)
6973         kill_src_buffer = 1;
6974       coding->src_object = Fcurrent_buffer ();
6975       if (BEG != GPT)
6976         move_gap_both (BEG, BEG_BYTE);
6977       coding->src_chars = Z - BEG;
6978       coding->src_bytes = Z_BYTE - BEG_BYTE;
6979       coding->src_pos = BEG;
6980       coding->src_pos_byte = BEG_BYTE;
6981       coding->src_multibyte = Z < Z_BYTE;
6982     }
6983   else if (STRINGP (src_object))
6984     {
6985       code_conversion_save (0, 0);
6986       coding->src_pos = from;
6987       coding->src_pos_byte = from_byte;
6988     }
6989   else if (BUFFERP (src_object))
6990     {
6991       code_conversion_save (0, 0);
6992       set_buffer_internal (XBUFFER (src_object));
6993       if (EQ (src_object, dst_object))
6994         {
6995           saved_pt = PT, saved_pt_byte = PT_BYTE;
6996           coding->src_object = del_range_1 (from, to, 1, 1);
6997           coding->src_pos = 0;
6998           coding->src_pos_byte = 0;
6999         }
7000       else
7001         {
7002           if (from < GPT && to >= GPT)
7003             move_gap_both (from, from_byte);
7004           coding->src_pos = from;
7005           coding->src_pos_byte = from_byte;
7006         }
7007     }
7008   else
7009     code_conversion_save (0, 0);
7010
7011   if (BUFFERP (dst_object))
7012     {
7013       coding->dst_object = dst_object;
7014       if (EQ (src_object, dst_object))
7015         {
7016           coding->dst_pos = from;
7017           coding->dst_pos_byte = from_byte;
7018         }
7019       else
7020         {
7021           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
7022           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
7023         }
7024       coding->dst_multibyte
7025         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
7026     }
7027   else if (EQ (dst_object, Qt))
7028     {
7029       coding->dst_object = Qnil;
7030       coding->dst_bytes = coding->src_chars;
7031       if (coding->dst_bytes == 0)
7032         coding->dst_bytes = 1;
7033       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7034       coding->dst_multibyte = 0;
7035     }
7036   else
7037     {
7038       coding->dst_object = Qnil;
7039       coding->dst_multibyte = 0;
7040     }
7041
7042   encode_coding (coding);
7043
7044   if (EQ (dst_object, Qt))
7045     {
7046       if (BUFFERP (coding->dst_object))
7047         coding->dst_object = Fbuffer_string ();
7048       else
7049         {
7050           coding->dst_object
7051             = make_unibyte_string ((char *) coding->destination,
7052                                    coding->produced);
7053           xfree (coding->destination);
7054         }
7055     }
7056
7057   if (saved_pt >= 0)
7058     {
7059       /* This is the case of:
7060          (BUFFERP (src_object) && EQ (src_object, dst_object))
7061          As we have moved PT while replacing the original buffer
7062          contents, we must recover it now.  */
7063       set_buffer_internal (XBUFFER (src_object));
7064       if (saved_pt < from)
7065         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7066       else if (saved_pt < from + chars)
7067         TEMP_SET_PT_BOTH (from, from_byte);
7068       else if (! NILP (current_buffer->enable_multibyte_characters))
7069         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7070                           saved_pt_byte + (coding->produced - bytes));
7071       else
7072         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7073                           saved_pt_byte + (coding->produced - bytes));
7074     }
7075
7076   if (kill_src_buffer)
7077     Fkill_buffer (coding->src_object);
7078   unbind_to (count, Qnil);
7079 }
7080
7081
7082 Lisp_Object
7083 preferred_coding_system ()
7084 {
7085   int id = coding_categories[coding_priorities[0]].id;
7086
7087   return CODING_ID_NAME (id);
7088 }
7089
7090 \f
7091 #ifdef emacs
7092 /*** 8. Emacs Lisp library functions ***/
7093
7094 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7095        doc: /* Return t if OBJECT is nil or a coding-system.
7096 See the documentation of `define-coding-system' for information
7097 about coding-system objects.  */)
7098      (obj)
7099      Lisp_Object obj;
7100 {
7101   if (NILP (obj)
7102       || CODING_SYSTEM_ID (obj) >= 0)
7103     return Qt;
7104   if (! SYMBOLP (obj)
7105       || NILP (Fget (obj, Qcoding_system_define_form)))
7106     return Qnil;
7107   return Qt;
7108 }
7109
7110 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7111        Sread_non_nil_coding_system, 1, 1, 0,
7112        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7113      (prompt)
7114      Lisp_Object prompt;
7115 {
7116   Lisp_Object val;
7117   do
7118     {
7119       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7120                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7121     }
7122   while (SCHARS (val) == 0);
7123   return (Fintern (val, Qnil));
7124 }
7125
7126 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7127        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7128 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
7129      (prompt, default_coding_system)
7130      Lisp_Object prompt, default_coding_system;
7131 {
7132   Lisp_Object val;
7133   if (SYMBOLP (default_coding_system))
7134     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
7135   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7136                           Qt, Qnil, Qcoding_system_history,
7137                           default_coding_system, Qnil);
7138   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7139 }
7140
7141 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7142        1, 1, 0,
7143        doc: /* Check validity of CODING-SYSTEM.
7144 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7145 It is valid if it is nil or a symbol defined as a coding system by the
7146 function `define-coding-system'.  */)
7147   (coding_system)
7148      Lisp_Object coding_system;
7149 {
7150   Lisp_Object define_form;
7151
7152   define_form = Fget (coding_system, Qcoding_system_define_form);
7153   if (! NILP (define_form))
7154     {
7155       Fput (coding_system, Qcoding_system_define_form, Qnil);
7156       safe_eval (define_form);
7157     }
7158   if (!NILP (Fcoding_system_p (coding_system)))
7159     return coding_system;
7160   while (1)
7161     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7162 }
7163
7164 \f
7165 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
7166    HIGHEST is nonzero, return the coding system of the highest
7167    priority among the detected coding systems.  Otherwize return a
7168    list of detected coding systems sorted by their priorities.  If
7169    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7170    multibyte form but contains only ASCII and eight-bit chars.
7171    Otherwise, the bytes are raw bytes.
7172
7173    CODING-SYSTEM controls the detection as below:
7174
7175    If it is nil, detect both text-format and eol-format.  If the
7176    text-format part of CODING-SYSTEM is already specified
7177    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
7178    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7179    detect only text-format.  */
7180
7181 Lisp_Object
7182 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7183                       coding_system)
7184      const unsigned char *src;
7185      int src_chars, src_bytes, highest;
7186      int multibytep;
7187      Lisp_Object coding_system;
7188 {
7189   const unsigned char *src_end = src + src_bytes;
7190   Lisp_Object attrs, eol_type;
7191   Lisp_Object val;
7192   struct coding_system coding;
7193   int id;
7194   struct coding_detection_info detect_info;
7195   enum coding_category base_category;
7196
7197   if (NILP (coding_system))
7198     coding_system = Qundecided;
7199   setup_coding_system (coding_system, &coding);
7200   attrs = CODING_ID_ATTRS (coding.id);
7201   eol_type = CODING_ID_EOL_TYPE (coding.id);
7202   coding_system = CODING_ATTR_BASE_NAME (attrs);
7203
7204   coding.source = src;
7205   coding.src_chars = src_chars;
7206   coding.src_bytes = src_bytes;
7207   coding.src_multibyte = multibytep;
7208   coding.consumed = 0;
7209   coding.mode |= CODING_MODE_LAST_BLOCK;
7210
7211   detect_info.checked = detect_info.found = detect_info.rejected = 0;
7212
7213   /* At first, detect text-format if necessary.  */
7214   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7215   if (base_category == coding_category_undecided)
7216     {
7217       enum coding_category category;
7218       struct coding_system *this;
7219       int c, i;
7220
7221       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
7222       for (i = 0; src < src_end; i++, src++)
7223         {
7224           c = *src;
7225           if (c & 0x80)
7226             break;
7227           if (c < 0x20
7228               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7229               && inhibit_iso_escape_detection)
7230             {
7231               coding.head_ascii = src - coding.source;
7232               if (detect_coding_iso_2022 (&coding, &detect_info))
7233                 {
7234                   /* We have scanned the whole data.  */
7235                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7236                     /* We didn't find an 8-bit code.  */
7237                     src = src_end;
7238                   break;
7239                 }
7240             }
7241         }
7242       coding.head_ascii = src - coding.source;
7243
7244       if (src < src_end
7245           || detect_info.found)
7246         {
7247           if (src == src_end)
7248             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
7249             for (i = 0; i < coding_category_raw_text; i++)
7250               {
7251                 category = coding_priorities[i];
7252                 if (detect_info.found & (1 << category))
7253                   break;
7254               }
7255           else
7256             for (i = 0; i < coding_category_raw_text; i++)
7257               {
7258                 category = coding_priorities[i];
7259                 this = coding_categories + category;
7260
7261                 if (this->id < 0)
7262                   {
7263                     /* No coding system of this category is defined.  */
7264                     detect_info.rejected |= (1 << category);
7265                   }
7266                 else if (category >= coding_category_raw_text)
7267                   continue;
7268                 else if (detect_info.checked & (1 << category))
7269                   {
7270                     if (highest
7271                         && (detect_info.found & (1 << category)))
7272                       break;
7273                   }
7274                 else
7275                   {
7276                     if ((*(this->detector)) (&coding, &detect_info)
7277                         && highest
7278                         && (detect_info.found & (1 << category)))
7279                       {
7280                         if (category == coding_category_utf_16_auto)
7281                           {
7282                             if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7283                               category = coding_category_utf_16_le;
7284                             else
7285                               category = coding_category_utf_16_be;
7286                           }
7287                         break;
7288                       }
7289                   }
7290               }
7291         }
7292
7293       if (detect_info.rejected == CATEGORY_MASK_ANY)
7294         {
7295           detect_info.found = CATEGORY_MASK_RAW_TEXT;
7296           id = coding_categories[coding_category_raw_text].id;
7297           val = Fcons (make_number (id), Qnil);
7298         }
7299       else if (! detect_info.rejected && ! detect_info.found)
7300         {
7301           detect_info.found = CATEGORY_MASK_ANY;
7302           id = coding_categories[coding_category_undecided].id;
7303           val = Fcons (make_number (id), Qnil);
7304         }
7305       else if (highest)
7306         {
7307           if (detect_info.found)
7308             {
7309               detect_info.found = 1 << category;
7310               val = Fcons (make_number (this->id), Qnil);
7311             }
7312           else
7313             for (i = 0; i < coding_category_raw_text; i++)
7314               if (! (detect_info.rejected & (1 << coding_priorities[i])))
7315                 {
7316                   detect_info.found = 1 << coding_priorities[i];
7317                   id = coding_categories[coding_priorities[i]].id;
7318                   val = Fcons (make_number (id), Qnil);
7319                   break;
7320                 }
7321         }
7322       else
7323         {
7324           int mask = detect_info.rejected | detect_info.found;
7325           int found = 0;
7326           val = Qnil;
7327
7328           for (i = coding_category_raw_text - 1; i >= 0; i--)
7329             {
7330               category = coding_priorities[i];
7331               if (! (mask & (1 << category)))
7332                 {
7333                   found |= 1 << category;
7334                   id = coding_categories[category].id;
7335                   val = Fcons (make_number (id), val);
7336                 }
7337             }
7338           for (i = coding_category_raw_text - 1; i >= 0; i--)
7339             {
7340               category = coding_priorities[i];
7341               if (detect_info.found & (1 << category))
7342                 {
7343                   id = coding_categories[category].id;
7344                   val = Fcons (make_number (id), val);
7345                 }
7346             }
7347           detect_info.found |= found;
7348         }
7349     }
7350   else if (base_category == coding_category_utf_16_auto)
7351     {
7352       if (detect_coding_utf_16 (&coding, &detect_info))
7353         {
7354           struct coding_system *this;
7355
7356           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7357             this = coding_categories + coding_category_utf_16_le;
7358           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7359             this = coding_categories + coding_category_utf_16_be;
7360           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7361             this = coding_categories + coding_category_utf_16_be_nosig;
7362           else
7363             this = coding_categories + coding_category_utf_16_le_nosig;
7364           val = Fcons (make_number (this->id), Qnil);
7365         }
7366     }
7367   else
7368     {
7369       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7370       val = Fcons (make_number (coding.id), Qnil);
7371     }
7372
7373   /* Then, detect eol-format if necessary.  */
7374   {
7375     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7376     Lisp_Object tail;
7377
7378     if (VECTORP (eol_type))
7379       {
7380         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7381           normal_eol = detect_eol (coding.source, src_bytes,
7382                                    coding_category_raw_text);
7383         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7384                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7385           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7386                                       coding_category_utf_16_be);
7387         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7388                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7389           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7390                                       coding_category_utf_16_le);
7391       }
7392     else
7393       {
7394         if (EQ (eol_type, Qunix))
7395           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7396         else if (EQ (eol_type, Qdos))
7397           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7398         else
7399           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7400       }
7401
7402     for (tail = val; CONSP (tail); tail = XCDR (tail))
7403       {
7404         enum coding_category category;
7405         int this_eol;
7406
7407         id = XINT (XCAR (tail));
7408         attrs = CODING_ID_ATTRS (id);
7409         category = XINT (CODING_ATTR_CATEGORY (attrs));
7410         eol_type = CODING_ID_EOL_TYPE (id);
7411         if (VECTORP (eol_type))
7412           {
7413             if (category == coding_category_utf_16_be
7414                 || category == coding_category_utf_16_be_nosig)
7415               this_eol = utf_16_be_eol;
7416             else if (category == coding_category_utf_16_le
7417                      || category == coding_category_utf_16_le_nosig)
7418               this_eol = utf_16_le_eol;
7419             else
7420               this_eol = normal_eol;
7421
7422             if (this_eol == EOL_SEEN_LF)
7423               XSETCAR (tail, AREF (eol_type, 0));
7424             else if (this_eol == EOL_SEEN_CRLF)
7425               XSETCAR (tail, AREF (eol_type, 1));
7426             else if (this_eol == EOL_SEEN_CR)
7427               XSETCAR (tail, AREF (eol_type, 2));
7428             else
7429               XSETCAR (tail, CODING_ID_NAME (id));
7430           }
7431         else
7432           XSETCAR (tail, CODING_ID_NAME (id));
7433       }
7434   }
7435
7436   return (highest ? XCAR (val) : val);
7437 }
7438
7439
7440 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7441        2, 3, 0,
7442        doc: /* Detect coding system of the text in the region between START and END.
7443 Return a list of possible coding systems ordered by priority.
7444
7445 If only ASCII characters are found, it returns a list of single element
7446 `undecided' or its subsidiary coding system according to a detected
7447 end-of-line format.
7448
7449 If optional argument HIGHEST is non-nil, return the coding system of
7450 highest priority.  */)
7451      (start, end, highest)
7452      Lisp_Object start, end, highest;
7453 {
7454   int from, to;
7455   int from_byte, to_byte;
7456
7457   CHECK_NUMBER_COERCE_MARKER (start);
7458   CHECK_NUMBER_COERCE_MARKER (end);
7459
7460   validate_region (&start, &end);
7461   from = XINT (start), to = XINT (end);
7462   from_byte = CHAR_TO_BYTE (from);
7463   to_byte = CHAR_TO_BYTE (to);
7464
7465   if (from < GPT && to >= GPT)
7466     move_gap_both (to, to_byte);
7467
7468   return detect_coding_system (BYTE_POS_ADDR (from_byte),
7469                                to - from, to_byte - from_byte,
7470                                !NILP (highest),
7471                                !NILP (current_buffer
7472                                       ->enable_multibyte_characters),
7473                                Qnil);
7474 }
7475
7476 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7477        1, 2, 0,
7478        doc: /* Detect coding system of the text in STRING.
7479 Return a list of possible coding systems ordered by priority.
7480
7481 If only ASCII characters are found, it returns a list of single element
7482 `undecided' or its subsidiary coding system according to a detected
7483 end-of-line format.
7484
7485 If optional argument HIGHEST is non-nil, return the coding system of
7486 highest priority.  */)
7487      (string, highest)
7488      Lisp_Object string, highest;
7489 {
7490   CHECK_STRING (string);
7491
7492   return detect_coding_system (SDATA (string),
7493                                SCHARS (string), SBYTES (string),
7494                                !NILP (highest), STRING_MULTIBYTE (string),
7495                                Qnil);
7496 }
7497
7498
7499 static INLINE int
7500 char_encodable_p (c, attrs)
7501      int c;
7502      Lisp_Object attrs;
7503 {
7504   Lisp_Object tail;
7505   struct charset *charset;
7506   Lisp_Object translation_table;
7507
7508   translation_table = CODING_ATTR_TRANS_TBL (attrs);
7509   if (! NILP (translation_table))
7510     c = translate_char (translation_table, c);
7511   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7512        CONSP (tail); tail = XCDR (tail))
7513     {
7514       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7515       if (CHAR_CHARSET_P (c, charset))
7516         break;
7517     }
7518   return (! NILP (tail));
7519 }
7520
7521
7522 /* Return a list of coding systems that safely encode the text between
7523    START and END.  If EXCLUDE is non-nil, it is a list of coding
7524    systems not to check.  The returned list doesn't contain any such
7525    coding systems.  In any case, if the text contains only ASCII or is
7526    unibyte, return t.  */
7527
7528 DEFUN ("find-coding-systems-region-internal",
7529        Ffind_coding_systems_region_internal,
7530        Sfind_coding_systems_region_internal, 2, 3, 0,
7531        doc: /* Internal use only.  */)
7532      (start, end, exclude)
7533      Lisp_Object start, end, exclude;
7534 {
7535   Lisp_Object coding_attrs_list, safe_codings;
7536   EMACS_INT start_byte, end_byte;
7537   const unsigned char *p, *pbeg, *pend;
7538   int c;
7539   Lisp_Object tail, elt;
7540
7541   if (STRINGP (start))
7542     {
7543       if (!STRING_MULTIBYTE (start)
7544           || SCHARS (start) == SBYTES (start))
7545         return Qt;
7546       start_byte = 0;
7547       end_byte = SBYTES (start);
7548     }
7549   else
7550     {
7551       CHECK_NUMBER_COERCE_MARKER (start);
7552       CHECK_NUMBER_COERCE_MARKER (end);
7553       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7554         args_out_of_range (start, end);
7555       if (NILP (current_buffer->enable_multibyte_characters))
7556         return Qt;
7557       start_byte = CHAR_TO_BYTE (XINT (start));
7558       end_byte = CHAR_TO_BYTE (XINT (end));
7559       if (XINT (end) - XINT (start) == end_byte - start_byte)
7560         return Qt;
7561
7562       if (XINT (start) < GPT && XINT (end) > GPT)
7563         {
7564           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7565             move_gap_both (XINT (start), start_byte);
7566           else
7567             move_gap_both (XINT (end), end_byte);
7568         }
7569     }
7570
7571   coding_attrs_list = Qnil;
7572   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7573     if (NILP (exclude)
7574         || NILP (Fmemq (XCAR (tail), exclude)))
7575       {
7576         Lisp_Object attrs;
7577
7578         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7579         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7580             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7581           {
7582             ASET (attrs, coding_attr_trans_tbl,
7583                   get_translation_table (attrs, 1, NULL));
7584             coding_attrs_list = Fcons (attrs, coding_attrs_list);
7585           }
7586       }
7587
7588   if (STRINGP (start))
7589     p = pbeg = SDATA (start);
7590   else
7591     p = pbeg = BYTE_POS_ADDR (start_byte);
7592   pend = p + (end_byte - start_byte);
7593
7594   while (p < pend && ASCII_BYTE_P (*p)) p++;
7595   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7596
7597   while (p < pend)
7598     {
7599       if (ASCII_BYTE_P (*p))
7600         p++;
7601       else
7602         {
7603           c = STRING_CHAR_ADVANCE (p);
7604
7605           charset_map_loaded = 0;
7606           for (tail = coding_attrs_list; CONSP (tail);)
7607             {
7608               elt = XCAR (tail);
7609               if (NILP (elt))
7610                 tail = XCDR (tail);
7611               else if (char_encodable_p (c, elt))
7612                 tail = XCDR (tail);
7613               else if (CONSP (XCDR (tail)))
7614                 {
7615                   XSETCAR (tail, XCAR (XCDR (tail)));
7616                   XSETCDR (tail, XCDR (XCDR (tail)));
7617                 }
7618               else
7619                 {
7620                   XSETCAR (tail, Qnil);
7621                   tail = XCDR (tail);
7622                 }
7623             }
7624           if (charset_map_loaded)
7625             {
7626               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7627
7628               if (STRINGP (start))
7629                 pbeg = SDATA (start);
7630               else
7631                 pbeg = BYTE_POS_ADDR (start_byte);
7632               p = pbeg + p_offset;
7633               pend = pbeg + pend_offset;
7634             }
7635         }
7636     }
7637
7638   safe_codings = list2 (Qraw_text, Qno_conversion);
7639   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7640     if (! NILP (XCAR (tail)))
7641       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7642
7643   return safe_codings;
7644 }
7645
7646
7647 DEFUN ("unencodable-char-position", Funencodable_char_position,
7648        Sunencodable_char_position, 3, 5, 0,
7649        doc: /*
7650 Return position of first un-encodable character in a region.
7651 START and END specfiy the region and CODING-SYSTEM specifies the
7652 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7653
7654 If optional 4th argument COUNT is non-nil, it specifies at most how
7655 many un-encodable characters to search.  In this case, the value is a
7656 list of positions.
7657
7658 If optional 5th argument STRING is non-nil, it is a string to search
7659 for un-encodable characters.  In that case, START and END are indexes
7660 to the string.  */)
7661      (start, end, coding_system, count, string)
7662      Lisp_Object start, end, coding_system, count, string;
7663 {
7664   int n;
7665   struct coding_system coding;
7666   Lisp_Object attrs, charset_list, translation_table;
7667   Lisp_Object positions;
7668   int from, to;
7669   const unsigned char *p, *stop, *pend;
7670   int ascii_compatible;
7671
7672   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7673   attrs = CODING_ID_ATTRS (coding.id);
7674   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7675     return Qnil;
7676   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7677   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7678   translation_table = get_translation_table (attrs, 1, NULL);
7679
7680   if (NILP (string))
7681     {
7682       validate_region (&start, &end);
7683       from = XINT (start);
7684       to = XINT (end);
7685       if (NILP (current_buffer->enable_multibyte_characters)
7686           || (ascii_compatible
7687               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7688         return Qnil;
7689       p = CHAR_POS_ADDR (from);
7690       pend = CHAR_POS_ADDR (to);
7691       if (from < GPT && to >= GPT)
7692         stop = GPT_ADDR;
7693       else
7694         stop = pend;
7695     }
7696   else
7697     {
7698       CHECK_STRING (string);
7699       CHECK_NATNUM (start);
7700       CHECK_NATNUM (end);
7701       from = XINT (start);
7702       to = XINT (end);
7703       if (from > to
7704           || to > SCHARS (string))
7705         args_out_of_range_3 (string, start, end);
7706       if (! STRING_MULTIBYTE (string))
7707         return Qnil;
7708       p = SDATA (string) + string_char_to_byte (string, from);
7709       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7710       if (ascii_compatible && (to - from) == (pend - p))
7711         return Qnil;
7712     }
7713
7714   if (NILP (count))
7715     n = 1;
7716   else
7717     {
7718       CHECK_NATNUM (count);
7719       n = XINT (count);
7720     }
7721
7722   positions = Qnil;
7723   while (1)
7724     {
7725       int c;
7726
7727       if (ascii_compatible)
7728         while (p < stop && ASCII_BYTE_P (*p))
7729           p++, from++;
7730       if (p >= stop)
7731         {
7732           if (p >= pend)
7733             break;
7734           stop = pend;
7735           p = GAP_END_ADDR;
7736         }
7737
7738       c = STRING_CHAR_ADVANCE (p);
7739       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7740           && ! char_charset (translate_char (translation_table, c),
7741                              charset_list, NULL))
7742         {
7743           positions = Fcons (make_number (from), positions);
7744           n--;
7745           if (n == 0)
7746             break;
7747         }
7748
7749       from++;
7750     }
7751
7752   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7753 }
7754
7755
7756 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7757        Scheck_coding_systems_region, 3, 3, 0,
7758        doc: /* Check if the region is encodable by coding systems.
7759
7760 START and END are buffer positions specifying the region.
7761 CODING-SYSTEM-LIST is a list of coding systems to check.
7762
7763 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7764 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7765 whole region, POS0, POS1, ... are buffer positions where non-encodable
7766 characters are found.
7767
7768 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7769 value is nil.
7770
7771 START may be a string.  In that case, check if the string is
7772 encodable, and the value contains indices to the string instead of
7773 buffer positions.  END is ignored.  */)
7774      (start, end, coding_system_list)
7775      Lisp_Object start, end, coding_system_list;
7776 {
7777   Lisp_Object list;
7778   EMACS_INT start_byte, end_byte;
7779   int pos;
7780   const unsigned char *p, *pbeg, *pend;
7781   int c;
7782   Lisp_Object tail, elt, attrs;
7783
7784   if (STRINGP (start))
7785     {
7786       if (!STRING_MULTIBYTE (start)
7787           && SCHARS (start) != SBYTES (start))
7788         return Qnil;
7789       start_byte = 0;
7790       end_byte = SBYTES (start);
7791       pos = 0;
7792     }
7793   else
7794     {
7795       CHECK_NUMBER_COERCE_MARKER (start);
7796       CHECK_NUMBER_COERCE_MARKER (end);
7797       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7798         args_out_of_range (start, end);
7799       if (NILP (current_buffer->enable_multibyte_characters))
7800         return Qnil;
7801       start_byte = CHAR_TO_BYTE (XINT (start));
7802       end_byte = CHAR_TO_BYTE (XINT (end));
7803       if (XINT (end) - XINT (start) == end_byte - start_byte)
7804         return Qt;
7805
7806       if (XINT (start) < GPT && XINT (end) > GPT)
7807         {
7808           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7809             move_gap_both (XINT (start), start_byte);
7810           else
7811             move_gap_both (XINT (end), end_byte);
7812         }
7813       pos = XINT (start);
7814     }
7815
7816   list = Qnil;
7817   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7818     {
7819       elt = XCAR (tail);
7820       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7821       ASET (attrs, coding_attr_trans_tbl,
7822             get_translation_table (attrs, 1, NULL));
7823       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7824     }
7825
7826   if (STRINGP (start))
7827     p = pbeg = SDATA (start);
7828   else
7829     p = pbeg = BYTE_POS_ADDR (start_byte);
7830   pend = p + (end_byte - start_byte);
7831
7832   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7833   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7834
7835   while (p < pend)
7836     {
7837       if (ASCII_BYTE_P (*p))
7838         p++;
7839       else
7840         {
7841           c = STRING_CHAR_ADVANCE (p);
7842
7843           charset_map_loaded = 0;
7844           for (tail = list; CONSP (tail); tail = XCDR (tail))
7845             {
7846               elt = XCDR (XCAR (tail));
7847               if (! char_encodable_p (c, XCAR (elt)))
7848                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7849             }
7850           if (charset_map_loaded)
7851             {
7852               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7853
7854               if (STRINGP (start))
7855                 pbeg = SDATA (start);
7856               else
7857                 pbeg = BYTE_POS_ADDR (start_byte);
7858               p = pbeg + p_offset;
7859               pend = pbeg + pend_offset;
7860             }
7861         }
7862       pos++;
7863     }
7864
7865   tail = list;
7866   list = Qnil;
7867   for (; CONSP (tail); tail = XCDR (tail))
7868     {
7869       elt = XCAR (tail);
7870       if (CONSP (XCDR (XCDR (elt))))
7871         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7872                       list);
7873     }
7874
7875   return list;
7876 }
7877
7878
7879 Lisp_Object
7880 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7881      Lisp_Object start, end, coding_system, dst_object;
7882      int encodep, norecord;
7883 {
7884   struct coding_system coding;
7885   EMACS_INT from, from_byte, to, to_byte;
7886   Lisp_Object src_object;
7887
7888   CHECK_NUMBER_COERCE_MARKER (start);
7889   CHECK_NUMBER_COERCE_MARKER (end);
7890   if (NILP (coding_system))
7891     coding_system = Qno_conversion;
7892   else
7893     CHECK_CODING_SYSTEM (coding_system);
7894   src_object = Fcurrent_buffer ();
7895   if (NILP (dst_object))
7896     dst_object = src_object;
7897   else if (! EQ (dst_object, Qt))
7898     CHECK_BUFFER (dst_object);
7899
7900   validate_region (&start, &end);
7901   from = XFASTINT (start);
7902   from_byte = CHAR_TO_BYTE (from);
7903   to = XFASTINT (end);
7904   to_byte = CHAR_TO_BYTE (to);
7905
7906   setup_coding_system (coding_system, &coding);
7907   coding.mode |= CODING_MODE_LAST_BLOCK;
7908
7909   if (encodep)
7910     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7911                           dst_object);
7912   else
7913     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7914                           dst_object);
7915   if (! norecord)
7916     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7917
7918   return (BUFFERP (dst_object)
7919           ? make_number (coding.produced_char)
7920           : coding.dst_object);
7921 }
7922
7923
7924 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7925        3, 4, "r\nzCoding system: ",
7926        doc: /* Decode the current region from the specified coding system.
7927 When called from a program, takes four arguments:
7928         START, END, CODING-SYSTEM, and DESTINATION.
7929 START and END are buffer positions.
7930
7931 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7932 If nil, the region between START and END is replace by the decoded text.
7933 If buffer, the decoded text is inserted in the buffer.
7934 If t, the decoded text is returned.
7935
7936 This function sets `last-coding-system-used' to the precise coding system
7937 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7938 not fully specified.)
7939 It returns the length of the decoded text.  */)
7940      (start, end, coding_system, destination)
7941      Lisp_Object start, end, coding_system, destination;
7942 {
7943   return code_convert_region (start, end, coding_system, destination, 0, 0);
7944 }
7945
7946 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7947        3, 4, "r\nzCoding system: ",
7948        doc: /* Encode the current region by specified coding system.
7949 When called from a program, takes three arguments:
7950 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7951
7952 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7953 If nil, the region between START and END is replace by the encoded text.
7954 If buffer, the encoded text is inserted in the buffer.
7955 If t, the encoded text is returned.
7956
7957 This function sets `last-coding-system-used' to the precise coding system
7958 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7959 not fully specified.)
7960 It returns the length of the encoded text.  */)
7961   (start, end, coding_system, destination)
7962      Lisp_Object start, end, coding_system, destination;
7963 {
7964   return code_convert_region (start, end, coding_system, destination, 1, 0);
7965 }
7966
7967 Lisp_Object
7968 code_convert_string (string, coding_system, dst_object,
7969                      encodep, nocopy, norecord)
7970      Lisp_Object string, coding_system, dst_object;
7971      int encodep, nocopy, norecord;
7972 {
7973   struct coding_system coding;
7974   EMACS_INT chars, bytes;
7975
7976   CHECK_STRING (string);
7977   if (NILP (coding_system))
7978     {
7979       if (! norecord)
7980         Vlast_coding_system_used = Qno_conversion;
7981       if (NILP (dst_object))
7982         return (nocopy ? Fcopy_sequence (string) : string);
7983     }
7984
7985   if (NILP (coding_system))
7986     coding_system = Qno_conversion;
7987   else
7988     CHECK_CODING_SYSTEM (coding_system);
7989   if (NILP (dst_object))
7990     dst_object = Qt;
7991   else if (! EQ (dst_object, Qt))
7992     CHECK_BUFFER (dst_object);
7993
7994   setup_coding_system (coding_system, &coding);
7995   coding.mode |= CODING_MODE_LAST_BLOCK;
7996   chars = SCHARS (string);
7997   bytes = SBYTES (string);
7998   if (encodep)
7999     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8000   else
8001     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
8002   if (! norecord)
8003     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
8004
8005   return (BUFFERP (dst_object)
8006           ? make_number (coding.produced_char)
8007           : coding.dst_object);
8008 }
8009
8010
8011 /* Encode or decode STRING according to CODING_SYSTEM.
8012    Do not set Vlast_coding_system_used.
8013
8014    This function is called only from macros DECODE_FILE and
8015    ENCODE_FILE, thus we ignore character composition.  */
8016
8017 Lisp_Object
8018 code_convert_string_norecord (string, coding_system, encodep)
8019      Lisp_Object string, coding_system;
8020      int encodep;
8021 {
8022   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
8023 }
8024
8025
8026 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
8027        2, 4, 0,
8028        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
8029
8030 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
8031 if the decoding operation is trivial.
8032
8033 Optional fourth arg BUFFER non-nil meant that the decoded text is
8034 inserted in BUFFER instead of returned as a string.  In this case,
8035 the return value is BUFFER.
8036
8037 This function sets `last-coding-system-used' to the precise coding system
8038 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8039 not fully specified.  */)
8040   (string, coding_system, nocopy, buffer)
8041      Lisp_Object string, coding_system, nocopy, buffer;
8042 {
8043   return code_convert_string (string, coding_system, buffer,
8044                               0, ! NILP (nocopy), 0);
8045 }
8046
8047 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8048        2, 4, 0,
8049        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8050
8051 Optional third arg NOCOPY non-nil means it is OK to return STRING
8052 itself if the encoding operation is trivial.
8053
8054 Optional fourth arg BUFFER non-nil meant that the encoded text is
8055 inserted in BUFFER instead of returned as a string.  In this case,
8056 the return value is BUFFER.
8057
8058 This function sets `last-coding-system-used' to the precise coding system
8059 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8060 not fully specified.)  */)
8061      (string, coding_system, nocopy, buffer)
8062      Lisp_Object string, coding_system, nocopy, buffer;
8063 {
8064   return code_convert_string (string, coding_system, buffer,
8065                               1, ! NILP (nocopy), 1);
8066 }
8067
8068 \f
8069 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8070        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8071 Return the corresponding character.  */)
8072      (code)
8073      Lisp_Object code;
8074 {
8075   Lisp_Object spec, attrs, val;
8076   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8077   int c;
8078
8079   CHECK_NATNUM (code);
8080   c = XFASTINT (code);
8081   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8082   attrs = AREF (spec, 0);
8083
8084   if (ASCII_BYTE_P (c)
8085       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8086     return code;
8087
8088   val = CODING_ATTR_CHARSET_LIST (attrs);
8089   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8090   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8091   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8092
8093   if (c <= 0x7F)
8094     charset = charset_roman;
8095   else if (c >= 0xA0 && c < 0xDF)
8096     {
8097       charset = charset_kana;
8098       c -= 0x80;
8099     }
8100   else
8101     {
8102       int s1 = c >> 8, s2 = c & 0xFF;
8103
8104       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8105           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8106         error ("Invalid code: %d", code);
8107       SJIS_TO_JIS (c);
8108       charset = charset_kanji;
8109     }
8110   c = DECODE_CHAR (charset, c);
8111   if (c < 0)
8112     error ("Invalid code: %d", code);
8113   return make_number (c);
8114 }
8115
8116
8117 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8118        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
8119 Return the corresponding code in SJIS.  */)
8120      (ch)
8121     Lisp_Object ch;
8122 {
8123   Lisp_Object spec, attrs, charset_list;
8124   int c;
8125   struct charset *charset;
8126   unsigned code;
8127
8128   CHECK_CHARACTER (ch);
8129   c = XFASTINT (ch);
8130   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8131   attrs = AREF (spec, 0);
8132
8133   if (ASCII_CHAR_P (c)
8134       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8135     return ch;
8136
8137   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8138   charset = char_charset (c, charset_list, &code);
8139   if (code == CHARSET_INVALID_CODE (charset))
8140     error ("Can't encode by shift_jis encoding: %d", c);
8141   JIS_TO_SJIS (code);
8142
8143   return make_number (code);
8144 }
8145
8146 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8147        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8148 Return the corresponding character.  */)
8149      (code)
8150      Lisp_Object code;
8151 {
8152   Lisp_Object spec, attrs, val;
8153   struct charset *charset_roman, *charset_big5, *charset;
8154   int c;
8155
8156   CHECK_NATNUM (code);
8157   c = XFASTINT (code);
8158   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8159   attrs = AREF (spec, 0);
8160
8161   if (ASCII_BYTE_P (c)
8162       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8163     return code;
8164
8165   val = CODING_ATTR_CHARSET_LIST (attrs);
8166   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8167   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8168
8169   if (c <= 0x7F)
8170     charset = charset_roman;
8171   else
8172     {
8173       int b1 = c >> 8, b2 = c & 0x7F;
8174       if (b1 < 0xA1 || b1 > 0xFE
8175           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8176         error ("Invalid code: %d", code);
8177       charset = charset_big5;
8178     }
8179   c = DECODE_CHAR (charset, (unsigned )c);
8180   if (c < 0)
8181     error ("Invalid code: %d", code);
8182   return make_number (c);
8183 }
8184
8185 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8186        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
8187 Return the corresponding character code in Big5.  */)
8188      (ch)
8189      Lisp_Object ch;
8190 {
8191   Lisp_Object spec, attrs, charset_list;
8192   struct charset *charset;
8193   int c;
8194   unsigned code;
8195
8196   CHECK_CHARACTER (ch);
8197   c = XFASTINT (ch);
8198   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8199   attrs = AREF (spec, 0);
8200   if (ASCII_CHAR_P (c)
8201       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8202     return ch;
8203
8204   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8205   charset = char_charset (c, charset_list, &code);
8206   if (code == CHARSET_INVALID_CODE (charset))
8207     error ("Can't encode by Big5 encoding: %d", c);
8208
8209   return make_number (code);
8210 }
8211
8212 \f
8213 DEFUN ("set-terminal-coding-system-internal",
8214        Fset_terminal_coding_system_internal,
8215        Sset_terminal_coding_system_internal, 1, 1, 0,
8216        doc: /* Internal use only.  */)
8217      (coding_system)
8218      Lisp_Object coding_system;
8219 {
8220   CHECK_SYMBOL (coding_system);
8221   setup_coding_system (Fcheck_coding_system (coding_system),
8222                         &terminal_coding);
8223
8224   /* We had better not send unsafe characters to terminal.  */
8225   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
8226   /* Characer composition should be disabled.  */
8227   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8228   terminal_coding.src_multibyte = 1;
8229   terminal_coding.dst_multibyte = 0;
8230   return Qnil;
8231 }
8232
8233 DEFUN ("set-safe-terminal-coding-system-internal",
8234        Fset_safe_terminal_coding_system_internal,
8235        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8236        doc: /* Internal use only.  */)
8237      (coding_system)
8238      Lisp_Object coding_system;
8239 {
8240   CHECK_SYMBOL (coding_system);
8241   setup_coding_system (Fcheck_coding_system (coding_system),
8242                        &safe_terminal_coding);
8243   /* Characer composition should be disabled.  */
8244   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8245   safe_terminal_coding.src_multibyte = 1;
8246   safe_terminal_coding.dst_multibyte = 0;
8247   return Qnil;
8248 }
8249
8250 DEFUN ("terminal-coding-system",
8251        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
8252        doc: /* Return coding system specified for terminal output.  */)
8253      ()
8254 {
8255   Lisp_Object coding_system;
8256
8257   coding_system = CODING_ID_NAME (terminal_coding.id);
8258   /* For backward compatibility, return nil if it is `undecided'. */
8259   return (! EQ (coding_system, Qundecided) ? coding_system : Qnil);
8260 }
8261
8262 DEFUN ("set-keyboard-coding-system-internal",
8263        Fset_keyboard_coding_system_internal,
8264        Sset_keyboard_coding_system_internal, 1, 1, 0,
8265        doc: /* Internal use only.  */)
8266      (coding_system)
8267      Lisp_Object coding_system;
8268 {
8269   CHECK_SYMBOL (coding_system);
8270   setup_coding_system (Fcheck_coding_system (coding_system),
8271                        &keyboard_coding);
8272   /* Characer composition should be disabled.  */
8273   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8274   return Qnil;
8275 }
8276
8277 DEFUN ("keyboard-coding-system",
8278        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8279        doc: /* Return coding system specified for decoding keyboard input.  */)
8280      ()
8281 {
8282   return CODING_ID_NAME (keyboard_coding.id);
8283 }
8284
8285 \f
8286 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8287        Sfind_operation_coding_system,  1, MANY, 0,
8288        doc: /* Choose a coding system for an operation based on the target name.
8289 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8290 DECODING-SYSTEM is the coding system to use for decoding
8291 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8292 for encoding (in case OPERATION does encoding).
8293
8294 The first argument OPERATION specifies an I/O primitive:
8295   For file I/O, `insert-file-contents' or `write-region'.
8296   For process I/O, `call-process', `call-process-region', or `start-process'.
8297   For network I/O, `open-network-stream'.
8298
8299 The remaining arguments should be the same arguments that were passed
8300 to the primitive.  Depending on which primitive, one of those arguments
8301 is selected as the TARGET.  For example, if OPERATION does file I/O,
8302 whichever argument specifies the file name is TARGET.
8303
8304 TARGET has a meaning which depends on OPERATION:
8305   For file I/O, TARGET is a file name.
8306   For process I/O, TARGET is a process name.
8307   For network I/O, TARGET is a service name or a port number
8308
8309 This function looks up what specified for TARGET in,
8310 `file-coding-system-alist', `process-coding-system-alist',
8311 or `network-coding-system-alist' depending on OPERATION.
8312 They may specify a coding system, a cons of coding systems,
8313 or a function symbol to call.
8314 In the last case, we call the function with one argument,
8315 which is a list of all the arguments given to this function.
8316
8317 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
8318      (nargs, args)
8319      int nargs;
8320      Lisp_Object *args;
8321 {
8322   Lisp_Object operation, target_idx, target, val;
8323   register Lisp_Object chain;
8324
8325   if (nargs < 2)
8326     error ("Too few arguments");
8327   operation = args[0];
8328   if (!SYMBOLP (operation)
8329       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8330     error ("Invalid first arguement");
8331   if (nargs < 1 + XINT (target_idx))
8332     error ("Too few arguments for operation: %s",
8333            SDATA (SYMBOL_NAME (operation)));
8334   target = args[XINT (target_idx) + 1];
8335   if (!(STRINGP (target)
8336         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8337     error ("Invalid %dth argument", XINT (target_idx) + 1);
8338
8339   chain = ((EQ (operation, Qinsert_file_contents)
8340             || EQ (operation, Qwrite_region))
8341            ? Vfile_coding_system_alist
8342            : (EQ (operation, Qopen_network_stream)
8343               ? Vnetwork_coding_system_alist
8344               : Vprocess_coding_system_alist));
8345   if (NILP (chain))
8346     return Qnil;
8347
8348   for (; CONSP (chain); chain = XCDR (chain))
8349     {
8350       Lisp_Object elt;
8351
8352       elt = XCAR (chain);
8353       if (CONSP (elt)
8354           && ((STRINGP (target)
8355                && STRINGP (XCAR (elt))
8356                && fast_string_match (XCAR (elt), target) >= 0)
8357               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8358         {
8359           val = XCDR (elt);
8360           /* Here, if VAL is both a valid coding system and a valid
8361              function symbol, we return VAL as a coding system.  */
8362           if (CONSP (val))
8363             return val;
8364           if (! SYMBOLP (val))
8365             return Qnil;
8366           if (! NILP (Fcoding_system_p (val)))
8367             return Fcons (val, val);
8368           if (! NILP (Ffboundp (val)))
8369             {
8370               val = call1 (val, Flist (nargs, args));
8371               if (CONSP (val))
8372                 return val;
8373               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8374                 return Fcons (val, val);
8375             }
8376           return Qnil;
8377         }
8378     }
8379   return Qnil;
8380 }
8381
8382 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8383        Sset_coding_system_priority, 0, MANY, 0,
8384        doc: /* Assign higher priority to the coding systems given as arguments.
8385 If multiple coding systems belongs to the same category,
8386 all but the first one are ignored.
8387
8388 usage: (set-coding-system-priority ...)  */)
8389      (nargs, args)
8390      int nargs;
8391      Lisp_Object *args;
8392 {
8393   int i, j;
8394   int changed[coding_category_max];
8395   enum coding_category priorities[coding_category_max];
8396
8397   bzero (changed, sizeof changed);
8398
8399   for (i = j = 0; i < nargs; i++)
8400     {
8401       enum coding_category category;
8402       Lisp_Object spec, attrs;
8403
8404       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8405       attrs = AREF (spec, 0);
8406       category = XINT (CODING_ATTR_CATEGORY (attrs));
8407       if (changed[category])
8408         /* Ignore this coding system because a coding system of the
8409            same category already had a higher priority.  */
8410         continue;
8411       changed[category] = 1;
8412       priorities[j++] = category;
8413       if (coding_categories[category].id >= 0
8414           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8415         setup_coding_system (args[i], &coding_categories[category]);
8416       Fset (AREF (Vcoding_category_table, category), args[i]);
8417     }
8418
8419   /* Now we have decided top J priorities.  Reflect the order of the
8420      original priorities to the remaining priorities.  */
8421
8422   for (i = j, j = 0; i < coding_category_max; i++, j++)
8423     {
8424       while (j < coding_category_max
8425              && changed[coding_priorities[j]])
8426         j++;
8427       if (j == coding_category_max)
8428         abort ();
8429       priorities[i] = coding_priorities[j];
8430     }
8431
8432   bcopy (priorities, coding_priorities, sizeof priorities);
8433
8434   /* Update `coding-category-list'.  */
8435   Vcoding_category_list = Qnil;
8436   for (i = coding_category_max - 1; i >= 0; i--)
8437     Vcoding_category_list
8438       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8439                Vcoding_category_list);
8440
8441   return Qnil;
8442 }
8443
8444 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8445        Scoding_system_priority_list, 0, 1, 0,
8446        doc: /* Return a list of coding systems ordered by their priorities.
8447 HIGHESTP non-nil means just return the highest priority one.  */)
8448      (highestp)
8449      Lisp_Object highestp;
8450 {
8451   int i;
8452   Lisp_Object val;
8453
8454   for (i = 0, val = Qnil; i < coding_category_max; i++)
8455     {
8456       enum coding_category category = coding_priorities[i];
8457       int id = coding_categories[category].id;
8458       Lisp_Object attrs;
8459
8460       if (id < 0)
8461         continue;
8462       attrs = CODING_ID_ATTRS (id);
8463       if (! NILP (highestp))
8464         return CODING_ATTR_BASE_NAME (attrs);
8465       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8466     }
8467   return Fnreverse (val);
8468 }
8469
8470 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8471
8472 static Lisp_Object
8473 make_subsidiaries (base)
8474      Lisp_Object base;
8475 {
8476   Lisp_Object subsidiaries;
8477   int base_name_len = SBYTES (SYMBOL_NAME (base));
8478   char *buf = (char *) alloca (base_name_len + 6);
8479   int i;
8480
8481   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8482   subsidiaries = Fmake_vector (make_number (3), Qnil);
8483   for (i = 0; i < 3; i++)
8484     {
8485       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8486       ASET (subsidiaries, i, intern (buf));
8487     }
8488   return subsidiaries;
8489 }
8490
8491
8492 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8493        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8494        doc: /* For internal use only.
8495 usage: (define-coding-system-internal ...)  */)
8496      (nargs, args)
8497      int nargs;
8498      Lisp_Object *args;
8499 {
8500   Lisp_Object name;
8501   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
8502   Lisp_Object attrs;            /* Vector of attributes.  */
8503   Lisp_Object eol_type;
8504   Lisp_Object aliases;
8505   Lisp_Object coding_type, charset_list, safe_charsets;
8506   enum coding_category category;
8507   Lisp_Object tail, val;
8508   int max_charset_id = 0;
8509   int i;
8510
8511   if (nargs < coding_arg_max)
8512     goto short_args;
8513
8514   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8515
8516   name = args[coding_arg_name];
8517   CHECK_SYMBOL (name);
8518   CODING_ATTR_BASE_NAME (attrs) = name;
8519
8520   val = args[coding_arg_mnemonic];
8521   if (! STRINGP (val))
8522     CHECK_CHARACTER (val);
8523   CODING_ATTR_MNEMONIC (attrs) = val;
8524
8525   coding_type = args[coding_arg_coding_type];
8526   CHECK_SYMBOL (coding_type);
8527   CODING_ATTR_TYPE (attrs) = coding_type;
8528
8529   charset_list = args[coding_arg_charset_list];
8530   if (SYMBOLP (charset_list))
8531     {
8532       if (EQ (charset_list, Qiso_2022))
8533         {
8534           if (! EQ (coding_type, Qiso_2022))
8535             error ("Invalid charset-list");
8536           charset_list = Viso_2022_charset_list;
8537         }
8538       else if (EQ (charset_list, Qemacs_mule))
8539         {
8540           if (! EQ (coding_type, Qemacs_mule))
8541             error ("Invalid charset-list");
8542           charset_list = Vemacs_mule_charset_list;
8543         }
8544       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8545         if (max_charset_id < XFASTINT (XCAR (tail)))
8546           max_charset_id = XFASTINT (XCAR (tail));
8547     }
8548   else
8549     {
8550       charset_list = Fcopy_sequence (charset_list);
8551       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8552         {
8553           struct charset *charset;
8554
8555           val = Fcar (tail);
8556           CHECK_CHARSET_GET_CHARSET (val, charset);
8557           if (EQ (coding_type, Qiso_2022)
8558               ? CHARSET_ISO_FINAL (charset) < 0
8559               : EQ (coding_type, Qemacs_mule)
8560               ? CHARSET_EMACS_MULE_ID (charset) < 0
8561               : 0)
8562             error ("Can't handle charset `%s'",
8563                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8564
8565           XSETCAR (tail, make_number (charset->id));
8566           if (max_charset_id < charset->id)
8567             max_charset_id = charset->id;
8568         }
8569     }
8570   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8571
8572   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8573                                 make_number (255));
8574   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8575     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8576   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8577
8578   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8579
8580   val = args[coding_arg_decode_translation_table];
8581   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8582     CHECK_SYMBOL (val);
8583   CODING_ATTR_DECODE_TBL (attrs) = val;
8584
8585   val = args[coding_arg_encode_translation_table];
8586   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8587     CHECK_SYMBOL (val);
8588   CODING_ATTR_ENCODE_TBL (attrs) = val;
8589
8590   val = args[coding_arg_post_read_conversion];
8591   CHECK_SYMBOL (val);
8592   CODING_ATTR_POST_READ (attrs) = val;
8593
8594   val = args[coding_arg_pre_write_conversion];
8595   CHECK_SYMBOL (val);
8596   CODING_ATTR_PRE_WRITE (attrs) = val;
8597
8598   val = args[coding_arg_default_char];
8599   if (NILP (val))
8600     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8601   else
8602     {
8603       CHECK_CHARACTER (val);
8604       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8605     }
8606
8607   val = args[coding_arg_for_unibyte];
8608   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8609
8610   val = args[coding_arg_plist];
8611   CHECK_LIST (val);
8612   CODING_ATTR_PLIST (attrs) = val;
8613
8614   if (EQ (coding_type, Qcharset))
8615     {
8616       /* Generate a lisp vector of 256 elements.  Each element is nil,
8617          integer, or a list of charset IDs.
8618
8619          If Nth element is nil, the byte code N is invalid in this
8620          coding system.
8621
8622          If Nth element is a number NUM, N is the first byte of a
8623          charset whose ID is NUM.
8624
8625          If Nth element is a list of charset IDs, N is the first byte
8626          of one of them.  The list is sorted by dimensions of the
8627          charsets.  A charset of smaller dimension comes firtst. */
8628       val = Fmake_vector (make_number (256), Qnil);
8629
8630       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8631         {
8632           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8633           int dim = CHARSET_DIMENSION (charset);
8634           int idx = (dim - 1) * 4;
8635
8636           if (CHARSET_ASCII_COMPATIBLE_P (charset))
8637             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8638
8639           for (i = charset->code_space[idx];
8640                i <= charset->code_space[idx + 1]; i++)
8641             {
8642               Lisp_Object tmp, tmp2;
8643               int dim2;
8644
8645               tmp = AREF (val, i);
8646               if (NILP (tmp))
8647                 tmp = XCAR (tail);
8648               else if (NUMBERP (tmp))
8649                 {
8650                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8651                   if (dim < dim2)
8652                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8653                   else
8654                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8655                 }
8656               else
8657                 {
8658                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8659                     {
8660                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8661                       if (dim < dim2)
8662                         break;
8663                     }
8664                   if (NILP (tmp2))
8665                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8666                   else
8667                     {
8668                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8669                       XSETCAR (tmp2, XCAR (tail));
8670                     }
8671                 }
8672               ASET (val, i, tmp);
8673             }
8674         }
8675       ASET (attrs, coding_attr_charset_valids, val);
8676       category = coding_category_charset;
8677     }
8678   else if (EQ (coding_type, Qccl))
8679     {
8680       Lisp_Object valids;
8681
8682       if (nargs < coding_arg_ccl_max)
8683         goto short_args;
8684
8685       val = args[coding_arg_ccl_decoder];
8686       CHECK_CCL_PROGRAM (val);
8687       if (VECTORP (val))
8688         val = Fcopy_sequence (val);
8689       ASET (attrs, coding_attr_ccl_decoder, val);
8690
8691       val = args[coding_arg_ccl_encoder];
8692       CHECK_CCL_PROGRAM (val);
8693       if (VECTORP (val))
8694         val = Fcopy_sequence (val);
8695       ASET (attrs, coding_attr_ccl_encoder, val);
8696
8697       val = args[coding_arg_ccl_valids];
8698       valids = Fmake_string (make_number (256), make_number (0));
8699       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8700         {
8701           int from, to;
8702
8703           val = Fcar (tail);
8704           if (INTEGERP (val))
8705             {
8706               from = to = XINT (val);
8707               if (from < 0 || from > 255)
8708                 args_out_of_range_3 (val, make_number (0), make_number (255));
8709             }
8710           else
8711             {
8712               CHECK_CONS (val);
8713               CHECK_NATNUM_CAR (val);
8714               CHECK_NATNUM_CDR (val);
8715               from = XINT (XCAR (val));
8716               if (from > 255)
8717                 args_out_of_range_3 (XCAR (val),
8718                                      make_number (0), make_number (255));
8719               to = XINT (XCDR (val));
8720               if (to < from || to > 255)
8721                 args_out_of_range_3 (XCDR (val),
8722                                      XCAR (val), make_number (255));
8723             }
8724           for (i = from; i <= to; i++)
8725             SSET (valids, i, 1);
8726         }
8727       ASET (attrs, coding_attr_ccl_valids, valids);
8728
8729       category = coding_category_ccl;
8730     }
8731   else if (EQ (coding_type, Qutf_16))
8732     {
8733       Lisp_Object bom, endian;
8734
8735       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8736
8737       if (nargs < coding_arg_utf16_max)
8738         goto short_args;
8739
8740       bom = args[coding_arg_utf16_bom];
8741       if (! NILP (bom) && ! EQ (bom, Qt))
8742         {
8743           CHECK_CONS (bom);
8744           val = XCAR (bom);
8745           CHECK_CODING_SYSTEM (val);
8746           val = XCDR (bom);
8747           CHECK_CODING_SYSTEM (val);
8748         }
8749       ASET (attrs, coding_attr_utf_16_bom, bom);
8750
8751       endian = args[coding_arg_utf16_endian];
8752       CHECK_SYMBOL (endian);
8753       if (NILP (endian))
8754         endian = Qbig;
8755       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8756         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8757       ASET (attrs, coding_attr_utf_16_endian, endian);
8758
8759       category = (CONSP (bom)
8760                   ? coding_category_utf_16_auto
8761                   : NILP (bom)
8762                   ? (EQ (endian, Qbig)
8763                      ? coding_category_utf_16_be_nosig
8764                      : coding_category_utf_16_le_nosig)
8765                   : (EQ (endian, Qbig)
8766                      ? coding_category_utf_16_be
8767                      : coding_category_utf_16_le));
8768     }
8769   else if (EQ (coding_type, Qiso_2022))
8770     {
8771       Lisp_Object initial, reg_usage, request, flags;
8772       int i;
8773
8774       if (nargs < coding_arg_iso2022_max)
8775         goto short_args;
8776
8777       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8778       CHECK_VECTOR (initial);
8779       for (i = 0; i < 4; i++)
8780         {
8781           val = Faref (initial, make_number (i));
8782           if (! NILP (val))
8783             {
8784               struct charset *charset;
8785
8786               CHECK_CHARSET_GET_CHARSET (val, charset);
8787               ASET (initial, i, make_number (CHARSET_ID (charset)));
8788               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8789                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8790             }
8791           else
8792             ASET (initial, i, make_number (-1));
8793         }
8794
8795       reg_usage = args[coding_arg_iso2022_reg_usage];
8796       CHECK_CONS (reg_usage);
8797       CHECK_NUMBER_CAR (reg_usage);
8798       CHECK_NUMBER_CDR (reg_usage);
8799
8800       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8801       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8802         {
8803           int id;
8804           Lisp_Object tmp;
8805
8806           val = Fcar (tail);
8807           CHECK_CONS (val);
8808           tmp = XCAR (val);
8809           CHECK_CHARSET_GET_ID (tmp, id);
8810           CHECK_NATNUM_CDR (val);
8811           if (XINT (XCDR (val)) >= 4)
8812             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8813           XSETCAR (val, make_number (id));
8814         }
8815
8816       flags = args[coding_arg_iso2022_flags];
8817       CHECK_NATNUM (flags);
8818       i = XINT (flags);
8819       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8820         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8821
8822       ASET (attrs, coding_attr_iso_initial, initial);
8823       ASET (attrs, coding_attr_iso_usage, reg_usage);
8824       ASET (attrs, coding_attr_iso_request, request);
8825       ASET (attrs, coding_attr_iso_flags, flags);
8826       setup_iso_safe_charsets (attrs);
8827
8828       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8829         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8830                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8831                     ? coding_category_iso_7_else
8832                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8833                     ? coding_category_iso_7
8834                     : coding_category_iso_7_tight);
8835       else
8836         {
8837           int id = XINT (AREF (initial, 1));
8838
8839           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8840                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8841                        || id < 0)
8842                       ? coding_category_iso_8_else
8843                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8844                       ? coding_category_iso_8_1
8845                       : coding_category_iso_8_2);
8846         }
8847       if (category != coding_category_iso_8_1
8848           && category != coding_category_iso_8_2)
8849         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8850     }
8851   else if (EQ (coding_type, Qemacs_mule))
8852     {
8853       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8854         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8855       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8856       category = coding_category_emacs_mule;
8857     }
8858   else if (EQ (coding_type, Qshift_jis))
8859     {
8860
8861       struct charset *charset;
8862
8863       if (XINT (Flength (charset_list)) != 3
8864           && XINT (Flength (charset_list)) != 4)
8865         error ("There should be three or four charsets");
8866
8867       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8868       if (CHARSET_DIMENSION (charset) != 1)
8869         error ("Dimension of charset %s is not one",
8870                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8871       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8872         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8873
8874       charset_list = XCDR (charset_list);
8875       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8876       if (CHARSET_DIMENSION (charset) != 1)
8877         error ("Dimension of charset %s is not one",
8878                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8879
8880       charset_list = XCDR (charset_list);
8881       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8882       if (CHARSET_DIMENSION (charset) != 2)
8883         error ("Dimension of charset %s is not two",
8884                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8885
8886       charset_list = XCDR (charset_list);
8887       if (! NILP (charset_list))
8888         {
8889           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8890           if (CHARSET_DIMENSION (charset) != 2)
8891             error ("Dimension of charset %s is not two",
8892                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8893         }
8894
8895       category = coding_category_sjis;
8896       Vsjis_coding_system = name;
8897     }
8898   else if (EQ (coding_type, Qbig5))
8899     {
8900       struct charset *charset;
8901
8902       if (XINT (Flength (charset_list)) != 2)
8903         error ("There should be just two charsets");
8904
8905       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8906       if (CHARSET_DIMENSION (charset) != 1)
8907         error ("Dimension of charset %s is not one",
8908                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8909       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8910         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8911
8912       charset_list = XCDR (charset_list);
8913       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8914       if (CHARSET_DIMENSION (charset) != 2)
8915         error ("Dimension of charset %s is not two",
8916                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8917
8918       category = coding_category_big5;
8919       Vbig5_coding_system = name;
8920     }
8921   else if (EQ (coding_type, Qraw_text))
8922     {
8923       category = coding_category_raw_text;
8924       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8925     }
8926   else if (EQ (coding_type, Qutf_8))
8927     {
8928       category = coding_category_utf_8;
8929       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8930     }
8931   else if (EQ (coding_type, Qundecided))
8932     category = coding_category_undecided;
8933   else
8934     error ("Invalid coding system type: %s",
8935            SDATA (SYMBOL_NAME (coding_type)));
8936
8937   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8938   CODING_ATTR_PLIST (attrs)
8939     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8940                                 CODING_ATTR_PLIST (attrs)));
8941   CODING_ATTR_PLIST (attrs)
8942     = Fcons (QCascii_compatible_p,
8943              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
8944                     CODING_ATTR_PLIST (attrs)));
8945
8946   eol_type = args[coding_arg_eol_type];
8947   if (! NILP (eol_type)
8948       && ! EQ (eol_type, Qunix)
8949       && ! EQ (eol_type, Qdos)
8950       && ! EQ (eol_type, Qmac))
8951     error ("Invalid eol-type");
8952
8953   aliases = Fcons (name, Qnil);
8954
8955   if (NILP (eol_type))
8956     {
8957       eol_type = make_subsidiaries (name);
8958       for (i = 0; i < 3; i++)
8959         {
8960           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8961
8962           this_name = AREF (eol_type, i);
8963           this_aliases = Fcons (this_name, Qnil);
8964           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8965           this_spec = Fmake_vector (make_number (3), attrs);
8966           ASET (this_spec, 1, this_aliases);
8967           ASET (this_spec, 2, this_eol_type);
8968           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8969           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8970           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
8971           if (NILP (val))
8972             Vcoding_system_alist
8973               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8974                        Vcoding_system_alist);
8975         }
8976     }
8977
8978   spec_vec = Fmake_vector (make_number (3), attrs);
8979   ASET (spec_vec, 1, aliases);
8980   ASET (spec_vec, 2, eol_type);
8981
8982   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8983   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8984   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
8985   if (NILP (val))
8986     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8987                                   Vcoding_system_alist);
8988
8989   {
8990     int id = coding_categories[category].id;
8991
8992     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8993       setup_coding_system (name, &coding_categories[category]);
8994   }
8995
8996   return Qnil;
8997
8998  short_args:
8999   return Fsignal (Qwrong_number_of_arguments,
9000                   Fcons (intern ("define-coding-system-internal"),
9001                          make_number (nargs)));
9002 }
9003
9004
9005 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
9006        3, 3, 0,
9007        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
9008   (coding_system, prop, val)
9009      Lisp_Object coding_system, prop, val;
9010 {
9011   Lisp_Object spec, attrs;
9012
9013   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9014   attrs = AREF (spec, 0);
9015   if (EQ (prop, QCmnemonic))
9016     {
9017       if (! STRINGP (val))
9018         CHECK_CHARACTER (val);
9019       CODING_ATTR_MNEMONIC (attrs) = val;
9020     }
9021   else if (EQ (prop, QCdefalut_char))
9022     {
9023       if (NILP (val))
9024         val = make_number (' ');
9025       else
9026         CHECK_CHARACTER (val);
9027       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
9028     }
9029   else if (EQ (prop, QCdecode_translation_table))
9030     {
9031       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9032         CHECK_SYMBOL (val);
9033       CODING_ATTR_DECODE_TBL (attrs) = val;
9034     }
9035   else if (EQ (prop, QCencode_translation_table))
9036     {
9037       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9038         CHECK_SYMBOL (val);
9039       CODING_ATTR_ENCODE_TBL (attrs) = val;
9040     }
9041   else if (EQ (prop, QCpost_read_conversion))
9042     {
9043       CHECK_SYMBOL (val);
9044       CODING_ATTR_POST_READ (attrs) = val;
9045     }
9046   else if (EQ (prop, QCpre_write_conversion))
9047     {
9048       CHECK_SYMBOL (val);
9049       CODING_ATTR_PRE_WRITE (attrs) = val;
9050     }
9051   else if (EQ (prop, QCascii_compatible_p))
9052     {
9053       CODING_ATTR_ASCII_COMPAT (attrs) = val;
9054     }
9055
9056   CODING_ATTR_PLIST (attrs)
9057     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9058   return val;
9059 }
9060
9061
9062 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9063        Sdefine_coding_system_alias, 2, 2, 0,
9064        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
9065      (alias, coding_system)
9066      Lisp_Object alias, coding_system;
9067 {
9068   Lisp_Object spec, aliases, eol_type, val;
9069
9070   CHECK_SYMBOL (alias);
9071   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9072   aliases = AREF (spec, 1);
9073   /* ALISES should be a list of length more than zero, and the first
9074      element is a base coding system.  Append ALIAS at the tail of the
9075      list.  */
9076   while (!NILP (XCDR (aliases)))
9077     aliases = XCDR (aliases);
9078   XSETCDR (aliases, Fcons (alias, Qnil));
9079
9080   eol_type = AREF (spec, 2);
9081   if (VECTORP (eol_type))
9082     {
9083       Lisp_Object subsidiaries;
9084       int i;
9085
9086       subsidiaries = make_subsidiaries (alias);
9087       for (i = 0; i < 3; i++)
9088         Fdefine_coding_system_alias (AREF (subsidiaries, i),
9089                                      AREF (eol_type, i));
9090     }
9091
9092   Fputhash (alias, spec, Vcoding_system_hash_table);
9093   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9094   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9095   if (NILP (val))
9096     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9097                                   Vcoding_system_alist);
9098
9099   return Qnil;
9100 }
9101
9102 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9103        1, 1, 0,
9104        doc: /* Return the base of CODING-SYSTEM.
9105 Any alias or subsidiary coding system is not a base coding system.  */)
9106   (coding_system)
9107      Lisp_Object coding_system;
9108 {
9109   Lisp_Object spec, attrs;
9110
9111   if (NILP (coding_system))
9112     return (Qno_conversion);
9113   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9114   attrs = AREF (spec, 0);
9115   return CODING_ATTR_BASE_NAME (attrs);
9116 }
9117
9118 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9119        1, 1, 0,
9120        doc: "Return the property list of CODING-SYSTEM.")
9121      (coding_system)
9122      Lisp_Object coding_system;
9123 {
9124   Lisp_Object spec, attrs;
9125
9126   if (NILP (coding_system))
9127     coding_system = Qno_conversion;
9128   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9129   attrs = AREF (spec, 0);
9130   return CODING_ATTR_PLIST (attrs);
9131 }
9132
9133
9134 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9135        1, 1, 0,
9136        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
9137      (coding_system)
9138      Lisp_Object coding_system;
9139 {
9140   Lisp_Object spec;
9141
9142   if (NILP (coding_system))
9143     coding_system = Qno_conversion;
9144   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9145   return AREF (spec, 1);
9146 }
9147
9148 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9149        Scoding_system_eol_type, 1, 1, 0,
9150        doc: /* Return eol-type of CODING-SYSTEM.
9151 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9152
9153 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9154 and CR respectively.
9155
9156 A vector value indicates that a format of end-of-line should be
9157 detected automatically.  Nth element of the vector is the subsidiary
9158 coding system whose eol-type is N.  */)
9159      (coding_system)
9160      Lisp_Object coding_system;
9161 {
9162   Lisp_Object spec, eol_type;
9163   int n;
9164
9165   if (NILP (coding_system))
9166     coding_system = Qno_conversion;
9167   if (! CODING_SYSTEM_P (coding_system))
9168     return Qnil;
9169   spec = CODING_SYSTEM_SPEC (coding_system);
9170   eol_type = AREF (spec, 2);
9171   if (VECTORP (eol_type))
9172     return Fcopy_sequence (eol_type);
9173   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9174   return make_number (n);
9175 }
9176
9177 #endif /* emacs */
9178
9179 \f
9180 /*** 9. Post-amble ***/
9181
9182 void
9183 init_coding_once ()
9184 {
9185   int i;
9186
9187   for (i = 0; i < coding_category_max; i++)
9188     {
9189       coding_categories[i].id = -1;
9190       coding_priorities[i] = i;
9191     }
9192
9193   /* ISO2022 specific initialize routine.  */
9194   for (i = 0; i < 0x20; i++)
9195     iso_code_class[i] = ISO_control_0;
9196   for (i = 0x21; i < 0x7F; i++)
9197     iso_code_class[i] = ISO_graphic_plane_0;
9198   for (i = 0x80; i < 0xA0; i++)
9199     iso_code_class[i] = ISO_control_1;
9200   for (i = 0xA1; i < 0xFF; i++)
9201     iso_code_class[i] = ISO_graphic_plane_1;
9202   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9203   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9204   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9205   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9206   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9207   iso_code_class[ISO_CODE_ESC] = ISO_escape;
9208   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9209   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9210   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9211
9212   for (i = 0; i < 256; i++)
9213     {
9214       emacs_mule_bytes[i] = 1;
9215     }
9216   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9217   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9218   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9219   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9220 }
9221
9222 #ifdef emacs
9223
9224 void
9225 syms_of_coding ()
9226 {
9227   staticpro (&Vcoding_system_hash_table);
9228   {
9229     Lisp_Object args[2];
9230     args[0] = QCtest;
9231     args[1] = Qeq;
9232     Vcoding_system_hash_table = Fmake_hash_table (2, args);
9233   }
9234
9235   staticpro (&Vsjis_coding_system);
9236   Vsjis_coding_system = Qnil;
9237
9238   staticpro (&Vbig5_coding_system);
9239   Vbig5_coding_system = Qnil;
9240
9241   staticpro (&Vcode_conversion_reused_workbuf);
9242   Vcode_conversion_reused_workbuf = Qnil;
9243
9244   staticpro (&Vcode_conversion_workbuf_name);
9245   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9246
9247   reused_workbuf_in_use = 0;
9248
9249   DEFSYM (Qcharset, "charset");
9250   DEFSYM (Qtarget_idx, "target-idx");
9251   DEFSYM (Qcoding_system_history, "coding-system-history");
9252   Fset (Qcoding_system_history, Qnil);
9253
9254   /* Target FILENAME is the first argument.  */
9255   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9256   /* Target FILENAME is the third argument.  */
9257   Fput (Qwrite_region, Qtarget_idx, make_number (2));
9258
9259   DEFSYM (Qcall_process, "call-process");
9260   /* Target PROGRAM is the first argument.  */
9261   Fput (Qcall_process, Qtarget_idx, make_number (0));
9262
9263   DEFSYM (Qcall_process_region, "call-process-region");
9264   /* Target PROGRAM is the third argument.  */
9265   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9266
9267   DEFSYM (Qstart_process, "start-process");
9268   /* Target PROGRAM is the third argument.  */
9269   Fput (Qstart_process, Qtarget_idx, make_number (2));
9270
9271   DEFSYM (Qopen_network_stream, "open-network-stream");
9272   /* Target SERVICE is the fourth argument.  */
9273   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9274
9275   DEFSYM (Qcoding_system, "coding-system");
9276   DEFSYM (Qcoding_aliases, "coding-aliases");
9277
9278   DEFSYM (Qeol_type, "eol-type");
9279   DEFSYM (Qunix, "unix");
9280   DEFSYM (Qdos, "dos");
9281
9282   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9283   DEFSYM (Qpost_read_conversion, "post-read-conversion");
9284   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9285   DEFSYM (Qdefault_char, "default-char");
9286   DEFSYM (Qundecided, "undecided");
9287   DEFSYM (Qno_conversion, "no-conversion");
9288   DEFSYM (Qraw_text, "raw-text");
9289
9290   DEFSYM (Qiso_2022, "iso-2022");
9291
9292   DEFSYM (Qutf_8, "utf-8");
9293   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9294
9295   DEFSYM (Qutf_16, "utf-16");
9296   DEFSYM (Qbig, "big");
9297   DEFSYM (Qlittle, "little");
9298
9299   DEFSYM (Qshift_jis, "shift-jis");
9300   DEFSYM (Qbig5, "big5");
9301
9302   DEFSYM (Qcoding_system_p, "coding-system-p");
9303
9304   DEFSYM (Qcoding_system_error, "coding-system-error");
9305   Fput (Qcoding_system_error, Qerror_conditions,
9306         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9307   Fput (Qcoding_system_error, Qerror_message,
9308         build_string ("Invalid coding system"));
9309
9310   /* Intern this now in case it isn't already done.
9311      Setting this variable twice is harmless.
9312      But don't staticpro it here--that is done in alloc.c.  */
9313   Qchar_table_extra_slots = intern ("char-table-extra-slots");
9314
9315   DEFSYM (Qtranslation_table, "translation-table");
9316   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9317   DEFSYM (Qtranslation_table_id, "translation-table-id");
9318   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9319   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9320
9321   DEFSYM (Qvalid_codes, "valid-codes");
9322
9323   DEFSYM (Qemacs_mule, "emacs-mule");
9324
9325   DEFSYM (QCcategory, ":category");
9326   DEFSYM (QCmnemonic, ":mnemonic");
9327   DEFSYM (QCdefalut_char, ":default-char");
9328   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9329   DEFSYM (QCencode_translation_table, ":encode-translation-table");
9330   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9331   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9332   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9333
9334   Vcoding_category_table
9335     = Fmake_vector (make_number (coding_category_max), Qnil);
9336   staticpro (&Vcoding_category_table);
9337   /* Followings are target of code detection.  */
9338   ASET (Vcoding_category_table, coding_category_iso_7,
9339         intern ("coding-category-iso-7"));
9340   ASET (Vcoding_category_table, coding_category_iso_7_tight,
9341         intern ("coding-category-iso-7-tight"));
9342   ASET (Vcoding_category_table, coding_category_iso_8_1,
9343         intern ("coding-category-iso-8-1"));
9344   ASET (Vcoding_category_table, coding_category_iso_8_2,
9345         intern ("coding-category-iso-8-2"));
9346   ASET (Vcoding_category_table, coding_category_iso_7_else,
9347         intern ("coding-category-iso-7-else"));
9348   ASET (Vcoding_category_table, coding_category_iso_8_else,
9349         intern ("coding-category-iso-8-else"));
9350   ASET (Vcoding_category_table, coding_category_utf_8,
9351         intern ("coding-category-utf-8"));
9352   ASET (Vcoding_category_table, coding_category_utf_16_be,
9353         intern ("coding-category-utf-16-be"));
9354   ASET (Vcoding_category_table, coding_category_utf_16_auto,
9355         intern ("coding-category-utf-16-auto"));
9356   ASET (Vcoding_category_table, coding_category_utf_16_le,
9357         intern ("coding-category-utf-16-le"));
9358   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9359         intern ("coding-category-utf-16-be-nosig"));
9360   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9361         intern ("coding-category-utf-16-le-nosig"));
9362   ASET (Vcoding_category_table, coding_category_charset,
9363         intern ("coding-category-charset"));
9364   ASET (Vcoding_category_table, coding_category_sjis,
9365         intern ("coding-category-sjis"));
9366   ASET (Vcoding_category_table, coding_category_big5,
9367         intern ("coding-category-big5"));
9368   ASET (Vcoding_category_table, coding_category_ccl,
9369         intern ("coding-category-ccl"));
9370   ASET (Vcoding_category_table, coding_category_emacs_mule,
9371         intern ("coding-category-emacs-mule"));
9372   /* Followings are NOT target of code detection.  */
9373   ASET (Vcoding_category_table, coding_category_raw_text,
9374         intern ("coding-category-raw-text"));
9375   ASET (Vcoding_category_table, coding_category_undecided,
9376         intern ("coding-category-undecided"));
9377
9378   DEFSYM (Qinsufficient_source, "insufficient-source");
9379   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9380   DEFSYM (Qinvalid_source, "invalid-source");
9381   DEFSYM (Qinterrupted, "interrupted");
9382   DEFSYM (Qinsufficient_memory, "insufficient-memory");
9383   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
9384
9385   defsubr (&Scoding_system_p);
9386   defsubr (&Sread_coding_system);
9387   defsubr (&Sread_non_nil_coding_system);
9388   defsubr (&Scheck_coding_system);
9389   defsubr (&Sdetect_coding_region);
9390   defsubr (&Sdetect_coding_string);
9391   defsubr (&Sfind_coding_systems_region_internal);
9392   defsubr (&Sunencodable_char_position);
9393   defsubr (&Scheck_coding_systems_region);
9394   defsubr (&Sdecode_coding_region);
9395   defsubr (&Sencode_coding_region);
9396   defsubr (&Sdecode_coding_string);
9397   defsubr (&Sencode_coding_string);
9398   defsubr (&Sdecode_sjis_char);
9399   defsubr (&Sencode_sjis_char);
9400   defsubr (&Sdecode_big5_char);
9401   defsubr (&Sencode_big5_char);
9402   defsubr (&Sset_terminal_coding_system_internal);
9403   defsubr (&Sset_safe_terminal_coding_system_internal);
9404   defsubr (&Sterminal_coding_system);
9405   defsubr (&Sset_keyboard_coding_system_internal);
9406   defsubr (&Skeyboard_coding_system);
9407   defsubr (&Sfind_operation_coding_system);
9408   defsubr (&Sset_coding_system_priority);
9409   defsubr (&Sdefine_coding_system_internal);
9410   defsubr (&Sdefine_coding_system_alias);
9411   defsubr (&Scoding_system_put);
9412   defsubr (&Scoding_system_base);
9413   defsubr (&Scoding_system_plist);
9414   defsubr (&Scoding_system_aliases);
9415   defsubr (&Scoding_system_eol_type);
9416   defsubr (&Scoding_system_priority_list);
9417
9418   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9419                doc: /* List of coding systems.
9420
9421 Do not alter the value of this variable manually.  This variable should be
9422 updated by the functions `define-coding-system' and
9423 `define-coding-system-alias'.  */);
9424   Vcoding_system_list = Qnil;
9425
9426   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9427                doc: /* Alist of coding system names.
9428 Each element is one element list of coding system name.
9429 This variable is given to `completing-read' as TABLE argument.
9430
9431 Do not alter the value of this variable manually.  This variable should be
9432 updated by the functions `make-coding-system' and
9433 `define-coding-system-alias'.  */);
9434   Vcoding_system_alist = Qnil;
9435
9436   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9437                doc: /* List of coding-categories (symbols) ordered by priority.
9438
9439 On detecting a coding system, Emacs tries code detection algorithms
9440 associated with each coding-category one by one in this order.  When
9441 one algorithm agrees with a byte sequence of source text, the coding
9442 system bound to the corresponding coding-category is selected.
9443
9444 Don't modify this variable directly, but use `set-coding-priority'.  */);
9445   {
9446     int i;
9447
9448     Vcoding_category_list = Qnil;
9449     for (i = coding_category_max - 1; i >= 0; i--)
9450       Vcoding_category_list
9451         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9452                  Vcoding_category_list);
9453   }
9454
9455   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9456                doc: /* Specify the coding system for read operations.
9457 It is useful to bind this variable with `let', but do not set it globally.
9458 If the value is a coding system, it is used for decoding on read operation.
9459 If not, an appropriate element is used from one of the coding system alists:
9460 There are three such tables, `file-coding-system-alist',
9461 `process-coding-system-alist', and `network-coding-system-alist'.  */);
9462   Vcoding_system_for_read = Qnil;
9463
9464   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9465                doc: /* Specify the coding system for write operations.
9466 Programs bind this variable with `let', but you should not set it globally.
9467 If the value is a coding system, it is used for encoding of output,
9468 when writing it to a file and when sending it to a file or subprocess.
9469
9470 If this does not specify a coding system, an appropriate element
9471 is used from one of the coding system alists:
9472 There are three such tables, `file-coding-system-alist',
9473 `process-coding-system-alist', and `network-coding-system-alist'.
9474 For output to files, if the above procedure does not specify a coding system,
9475 the value of `buffer-file-coding-system' is used.  */);
9476   Vcoding_system_for_write = Qnil;
9477
9478   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9479                doc: /*
9480 Coding system used in the latest file or process I/O.  */);
9481   Vlast_coding_system_used = Qnil;
9482
9483   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9484                doc: /*
9485 Error status of the last code conversion.
9486
9487 When an error was detected in the last code conversion, this variable
9488 is set to one of the following symbols.
9489   `insufficient-source'
9490   `inconsistent-eol'
9491   `invalid-source'
9492   `interrupted'
9493   `insufficient-memory'
9494 When no error was detected, the value doesn't change.  So, to check
9495 the error status of a code conversion by this variable, you must
9496 explicitly set this variable to nil before performing code
9497 conversion.  */);
9498   Vlast_code_conversion_error = Qnil;
9499
9500   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9501                doc: /*
9502 *Non-nil means always inhibit code conversion of end-of-line format.
9503 See info node `Coding Systems' and info node `Text and Binary' concerning
9504 such conversion.  */);
9505   inhibit_eol_conversion = 0;
9506
9507   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9508                doc: /*
9509 Non-nil means process buffer inherits coding system of process output.
9510 Bind it to t if the process output is to be treated as if it were a file
9511 read from some filesystem.  */);
9512   inherit_process_coding_system = 0;
9513
9514   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9515                doc: /*
9516 Alist to decide a coding system to use for a file I/O operation.
9517 The format is ((PATTERN . VAL) ...),
9518 where PATTERN is a regular expression matching a file name,
9519 VAL is a coding system, a cons of coding systems, or a function symbol.
9520 If VAL is a coding system, it is used for both decoding and encoding
9521 the file contents.
9522 If VAL is a cons of coding systems, the car part is used for decoding,
9523 and the cdr part is used for encoding.
9524 If VAL is a function symbol, the function must return a coding system
9525 or a cons of coding systems which are used as above.  The function gets
9526 the arguments with which `find-operation-coding-systems' was called.
9527
9528 See also the function `find-operation-coding-system'
9529 and the variable `auto-coding-alist'.  */);
9530   Vfile_coding_system_alist = Qnil;
9531
9532   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9533                doc: /*
9534 Alist to decide a coding system to use for a process I/O operation.
9535 The format is ((PATTERN . VAL) ...),
9536 where PATTERN is a regular expression matching a program name,
9537 VAL is a coding system, a cons of coding systems, or a function symbol.
9538 If VAL is a coding system, it is used for both decoding what received
9539 from the program and encoding what sent to the program.
9540 If VAL is a cons of coding systems, the car part is used for decoding,
9541 and the cdr part is used for encoding.
9542 If VAL is a function symbol, the function must return a coding system
9543 or a cons of coding systems which are used as above.
9544
9545 See also the function `find-operation-coding-system'.  */);
9546   Vprocess_coding_system_alist = Qnil;
9547
9548   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9549                doc: /*
9550 Alist to decide a coding system to use for a network I/O operation.
9551 The format is ((PATTERN . VAL) ...),
9552 where PATTERN is a regular expression matching a network service name
9553 or is a port number to connect to,
9554 VAL is a coding system, a cons of coding systems, or a function symbol.
9555 If VAL is a coding system, it is used for both decoding what received
9556 from the network stream and encoding what sent to the network stream.
9557 If VAL is a cons of coding systems, the car part is used for decoding,
9558 and the cdr part is used for encoding.
9559 If VAL is a function symbol, the function must return a coding system
9560 or a cons of coding systems which are used as above.
9561
9562 See also the function `find-operation-coding-system'.  */);
9563   Vnetwork_coding_system_alist = Qnil;
9564
9565   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9566                doc: /* Coding system to use with system messages.
9567 Also used for decoding keyboard input on X Window system.  */);
9568   Vlocale_coding_system = Qnil;
9569
9570   /* The eol mnemonics are reset in startup.el system-dependently.  */
9571   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9572                doc: /*
9573 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
9574   eol_mnemonic_unix = build_string (":");
9575
9576   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9577                doc: /*
9578 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
9579   eol_mnemonic_dos = build_string ("\\");
9580
9581   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9582                doc: /*
9583 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
9584   eol_mnemonic_mac = build_string ("/");
9585
9586   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9587                doc: /*
9588 *String displayed in mode line when end-of-line format is not yet determined.  */);
9589   eol_mnemonic_undecided = build_string (":");
9590
9591   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9592                doc: /*
9593 *Non-nil enables character translation while encoding and decoding.  */);
9594   Venable_character_translation = Qt;
9595
9596   DEFVAR_LISP ("standard-translation-table-for-decode",
9597                &Vstandard_translation_table_for_decode,
9598                doc: /* Table for translating characters while decoding.  */);
9599   Vstandard_translation_table_for_decode = Qnil;
9600
9601   DEFVAR_LISP ("standard-translation-table-for-encode",
9602                &Vstandard_translation_table_for_encode,
9603                doc: /* Table for translating characters while encoding.  */);
9604   Vstandard_translation_table_for_encode = Qnil;
9605
9606   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9607                doc: /* Alist of charsets vs revision numbers.
9608 While encoding, if a charset (car part of an element) is found,
9609 designate it with the escape sequence identifying revision (cdr part
9610 of the element).  */);
9611   Vcharset_revision_table = Qnil;
9612
9613   DEFVAR_LISP ("default-process-coding-system",
9614                &Vdefault_process_coding_system,
9615                doc: /* Cons of coding systems used for process I/O by default.
9616 The car part is used for decoding a process output,
9617 the cdr part is used for encoding a text to be sent to a process.  */);
9618   Vdefault_process_coding_system = Qnil;
9619
9620   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9621                doc: /*
9622 Table of extra Latin codes in the range 128..159 (inclusive).
9623 This is a vector of length 256.
9624 If Nth element is non-nil, the existence of code N in a file
9625 \(or output of subprocess) doesn't prevent it to be detected as
9626 a coding system of ISO 2022 variant which has a flag
9627 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9628 or reading output of a subprocess.
9629 Only 128th through 159th elements has a meaning.  */);
9630   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9631
9632   DEFVAR_LISP ("select-safe-coding-system-function",
9633                &Vselect_safe_coding_system_function,
9634                doc: /*
9635 Function to call to select safe coding system for encoding a text.
9636
9637 If set, this function is called to force a user to select a proper
9638 coding system which can encode the text in the case that a default
9639 coding system used in each operation can't encode the text.
9640
9641 The default value is `select-safe-coding-system' (which see).  */);
9642   Vselect_safe_coding_system_function = Qnil;
9643
9644   DEFVAR_BOOL ("coding-system-require-warning",
9645                &coding_system_require_warning,
9646                doc: /* Internal use only.
9647 If non-nil, on writing a file, `select-safe-coding-system-function' is
9648 called even if `coding-system-for-write' is non-nil.  The command
9649 `universal-coding-system-argument' binds this variable to t temporarily.  */);
9650   coding_system_require_warning = 0;
9651
9652
9653   DEFVAR_BOOL ("inhibit-iso-escape-detection",
9654                &inhibit_iso_escape_detection,
9655                doc: /*
9656 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9657
9658 By default, on reading a file, Emacs tries to detect how the text is
9659 encoded.  This code detection is sensitive to escape sequences.  If
9660 the sequence is valid as ISO2022, the code is determined as one of
9661 the ISO2022 encodings, and the file is decoded by the corresponding
9662 coding system (e.g. `iso-2022-7bit').
9663
9664 However, there may be a case that you want to read escape sequences in
9665 a file as is.  In such a case, you can set this variable to non-nil.
9666 Then, as the code detection ignores any escape sequences, no file is
9667 detected as encoded in some ISO2022 encoding.  The result is that all
9668 escape sequences become visible in a buffer.
9669
9670 The default value is nil, and it is strongly recommended not to change
9671 it.  That is because many Emacs Lisp source files that contain
9672 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9673 in Emacs's distribution, and they won't be decoded correctly on
9674 reading if you suppress escape sequence detection.
9675
9676 The other way to read escape sequences in a file without decoding is
9677 to explicitly specify some coding system that doesn't use ISO2022's
9678 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
9679   inhibit_iso_escape_detection = 0;
9680
9681   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9682                doc: /* Char table for translating self-inserting characters.
9683 This is applied to the result of input methods, not their input.  See also
9684 `keyboard-translate-table'.  */);
9685     Vtranslation_table_for_input = Qnil;
9686
9687   {
9688     Lisp_Object args[coding_arg_max];
9689     Lisp_Object plist[16];
9690     int i;
9691
9692     for (i = 0; i < coding_arg_max; i++)
9693       args[i] = Qnil;
9694
9695     plist[0] = intern (":name");
9696     plist[1] = args[coding_arg_name] = Qno_conversion;
9697     plist[2] = intern (":mnemonic");
9698     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9699     plist[4] = intern (":coding-type");
9700     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9701     plist[6] = intern (":ascii-compatible-p");
9702     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9703     plist[8] = intern (":default-char");
9704     plist[9] = args[coding_arg_default_char] = make_number (0);
9705     plist[10] = intern (":for-unibyte");
9706     plist[11] = args[coding_arg_for_unibyte] = Qt;
9707     plist[12] = intern (":docstring");
9708     plist[13] = build_string ("Do no conversion.\n\
9709 \n\
9710 When you visit a file with this coding, the file is read into a\n\
9711 unibyte buffer as is, thus each byte of a file is treated as a\n\
9712 character.");
9713     plist[14] = intern (":eol-type");
9714     plist[15] = args[coding_arg_eol_type] = Qunix;
9715     args[coding_arg_plist] = Flist (16, plist);
9716     Fdefine_coding_system_internal (coding_arg_max, args);
9717
9718     plist[1] = args[coding_arg_name] = Qundecided;
9719     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
9720     plist[5] = args[coding_arg_coding_type] = Qundecided;
9721     /* This is already set.
9722        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
9723     plist[8] = intern (":charset-list");
9724     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
9725     plist[11] = args[coding_arg_for_unibyte] = Qnil;
9726     plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
9727     plist[15] = args[coding_arg_eol_type] = Qnil;
9728     args[coding_arg_plist] = Flist (16, plist);
9729     Fdefine_coding_system_internal (coding_arg_max, args);
9730   }
9731
9732   setup_coding_system (Qno_conversion, &keyboard_coding);
9733   setup_coding_system (Qundecided, &terminal_coding);
9734   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9735
9736   {
9737     int i;
9738
9739     for (i = 0; i < coding_category_max; i++)
9740       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9741   }
9742 }
9743
9744 char *
9745 emacs_strerror (error_number)
9746      int error_number;
9747 {
9748   char *str;
9749
9750   synchronize_system_messages_locale ();
9751   str = strerror (error_number);
9752
9753   if (! NILP (Vlocale_coding_system))
9754     {
9755       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9756                                                       Vlocale_coding_system,
9757                                                       0);
9758       str = (char *) SDATA (dec);
9759     }
9760
9761   return str;
9762 }
9763
9764 #endif /* emacs */
9765
9766 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9767    (do not change this comment) */