src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005,
   3                  2006 Free Software Foundation, Inc.
   4    Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
   5      National Institute of Advanced Industrial Science and Technology (AIST)
   6      Registration Number H14PRO021
   7    Copyright (C) 2003
   8      National Institute of Advanced Industrial Science and Technology (AIST)
   9      Registration Number H13PRO009
  10
  11 This file is part of GNU Emacs.
  12
  13 GNU Emacs is free software; you can redistribute it and/or modify
  14 it under the terms of the GNU General Public License as published by
  15 the Free Software Foundation; either version 2, or (at your option)
  16 any later version.
  17
  18 GNU Emacs is distributed in the hope that it will be useful,
  19 but WITHOUT ANY WARRANTY; without even the implied warranty of
  20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  21 GNU General Public License for more details.
  22
  23 You should have received a copy of the GNU General Public License
  24 along with GNU Emacs; see the file COPYING.  If not, write to
  25 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  26 Boston, MA 02110-1301, USA.  */
  27
  28 /*** TABLE OF CONTENTS ***
  29
  30   0. General comments
  31   1. Preamble
  32   2. Emacs' internal format (emacs-utf-8) handlers
  33   3. UTF-8 handlers
  34   4. UTF-16 handlers
  35   5. Charset-base coding systems handlers
  36   6. emacs-mule (old Emacs' internal format) handlers
  37   7. ISO2022 handlers
  38   8. Shift-JIS and BIG5 handlers
  39   9. CCL handlers
  40   10. C library functions
  41   11. Emacs Lisp library functions
  42   12. Postamble
  43
  44 */
  45
  46 /*** 0. General comments ***
  47
  48
  49 CODING SYSTEM
  50
  51   A coding system is an object for an encoding mechanism that contains
  52   information about how to convert byte sequences to character
  53   sequences and vice versa.  When we say "decode", it means converting
  54   a byte sequence of a specific coding system into a character
  55   sequence that is represented by Emacs' internal coding system
  56   `emacs-utf-8', and when we say "encode", it means converting a
  57   character sequence of emacs-utf-8 to a byte sequence of a specific
  58   coding system.
  59
  60   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  61   C level, a coding system is represented by a vector of attributes
  62   stored in the hash table Vcoding_system_hash_table.  The conversion
  63   from coding system symbol to attributes vector is done by looking up
  64   Vcoding_system_hash_table by the symbol.
  65
  66   Coding systems are classified into the following types depending on
  67   the encoding mechanism.  Here's a brief description of the types.
  68
  69   o UTF-8
  70
  71   o UTF-16
  72
  73   o Charset-base coding system
  74
  75   A coding system defined by one or more (coded) character sets.
  76   Decoding and encoding are done by a code converter defined for each
  77   character set.
  78
  79   o Old Emacs internal format (emacs-mule)
  80
  81   The coding system adopted by old versions of Emacs (20 and 21).
  82
  83   o ISO2022-base coding system
  84
  85   The most famous coding system for multiple character sets.  X's
  86   Compound Text, various EUCs (Extended Unix Code), and coding systems
  87   used in the Internet communication such as ISO-2022-JP are all
  88   variants of ISO2022.
  89
  90   o SJIS (or Shift-JIS or MS-Kanji-Code)
  91
  92   A coding system to encode character sets: ASCII, JISX0201, and
  93   JISX0208.  Widely used for PC's in Japan.  Details are described in
  94   section 8.
  95
  96   o BIG5
  97
  98   A coding system to encode character sets: ASCII and Big5.  Widely
  99   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
 100   described in section 8.  In this file, when we write "big5" (all
 101   lowercase), we mean the coding system, and when we write "Big5"
 102   (capitalized), we mean the character set.
 103
 104   o CCL
 105
 106   If a user wants to decode/encode text encoded in a coding system
 107   not listed above, he can supply a decoder and an encoder for it in
 108   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 109   program while decoding/encoding.
 110
 111   o Raw-text
 112
 113   A coding system for text containing raw eight-bit data.  Emacs
 114   treats each byte of source text as a character (except for
 115   end-of-line conversion).
 116
 117   o No-conversion
 118
 119   Like raw text, but don't do end-of-line conversion.
 120
 121
 122 END-OF-LINE FORMAT
 123
 124   How text end-of-line is encoded depends on operating system.  For
 125   instance, Unix's format is just one byte of LF (line-feed) code,
 126   whereas DOS's format is two-byte sequence of `carriage-return' and
 127   `line-feed' codes.  MacOS's format is usually one byte of
 128   `carriage-return'.
 129
 130   Since text character encoding and end-of-line encoding are
 131   independent, any coding system described above can take any format
 132   of end-of-line (except for no-conversion).
 133
 134 STRUCT CODING_SYSTEM
 135
 136   Before using a coding system for code conversion (i.e. decoding and
 137   encoding), we setup a structure of type `struct coding_system'.
 138   This structure keeps various information about a specific code
 139   conversion (e.g. the location of source and destination data).
 140
 141 */
 142
 143 /* COMMON MACROS */
 144
 145
 146 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 147
 148   These functions check if a byte sequence specified as a source in
 149   CODING conforms to the format of XXX, and update the members of
 150   DETECT_INFO.
 151
 152   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 153
 154   Below is the template of these functions.  */
 155
  156 #if 0
  157 static int
  158 detect_coding_XXX (coding, detect_info)
  159      struct coding_system *coding;
  160      struct coding_detection_info *detect_info;
  161 {
  162   const unsigned char *src = coding->source;
  163   const unsigned char *src_end = coding->source + coding->src_bytes;
  164   int multibytep = coding->src_multibyte;
  165   int consumed_chars = 0;
  166   int found = 0;
  167   ...;
  168   /* ONE_MORE_BYTE also uses `c' and `src_base'; presumably the elided part above declares them.  */
  169   while (1)
  170     {
  171       /* Get one byte from the source.  If the source is exhausted, jump
  172          to no_more_source:.  */
  173       ONE_MORE_BYTE (c);
  174       /* Leave the loop (and reject XXX below) on the first non-conforming byte.  */
  175       if (! __C_conforms_to_XXX___ (c))
  176         break;
  177       if (! __C_strongly_suggests_XXX__ (c)) /* NOTE(review): this `!' looks inverted -- confirm.  */
  178         found = CATEGORY_MASK_XXX;
  179     }
  180   /* The byte sequence is invalid for XXX.  */
  181   detect_info->rejected |= CATEGORY_MASK_XXX;
  182   return 0;
  183
  184  no_more_source:
  185   /* The source was exhausted successfully.  */
  186   detect_info->found |= found;
  187   return 1;
  188 }
  189 #endif
 190
 191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 192
 193   These functions decode a byte sequence specified as a source by
 194   CODING.  The resulting multibyte text goes to a place pointed to by
 195   CODING->charbuf, the length of which should not exceed
 196   CODING->charbuf_size;
 197
 198   These functions set the information of original and decoded texts in
 199   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 200   They also set CODING->result to one of CODING_RESULT_XXX indicating
 201   how the decoding is finished.
 202
 203   Below is the template of these functions.  */
 204
 205 #if 0
 206 static void
 207 decode_coding_XXXX (coding)
 208      struct coding_system *coding;
 209 {
 210   const unsigned char *src = coding->source + coding->consumed;
 211   const unsigned char *src_end = coding->source + coding->src_bytes;
 212   /* SRC_BASE remembers the start position in source in each loop.
 213      The loop will be exited when there's not enough source code, or
 214      when there's no room in CHARBUF for a decoded character.  */
 215   const unsigned char *src_base;
 216   /* A buffer to produce decoded characters.  */
 217   int *charbuf = coding->charbuf + coding->charbuf_used;
 218   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 219   int multibytep = coding->src_multibyte;
 220
 221   while (1)
 222     {
 223       src_base = src;
 224       if (charbuf < charbuf_end)
 225         /* No more room to produce a decoded character.  */
 226         break;
 227       ONE_MORE_BYTE (c);
 228       /* Decode it. */
 229     }
 230
 231  no_more_source:
 232   if (src_base < src_end
 233       && coding->mode & CODING_MODE_LAST_BLOCK)
 234     /* If the source ends by partial bytes to construct a character,
 235        treat them as eight-bit raw data.  */
 236     while (src_base < src_end && charbuf < charbuf_end)
 237       *charbuf++ = *src_base++;
 238   /* Remember how many bytes and characters we consumed.  If the
 239      source is multibyte, the bytes and chars are not identical.  */
 240   coding->consumed = coding->consumed_char = src_base - coding->source;
 241   /* Remember how many characters we produced.  */
 242   coding->charbuf_used = charbuf - coding->charbuf;
 243 }
 244 #endif
 245
 246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 247
 248   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 249   internal multibyte format by CODING.  The resulting byte sequence
 250   goes to a place pointed to by DESTINATION, the length of which
 251   should not exceed DST_BYTES.
 252
 253   These functions set the information of original and encoded texts in
 254   the members produced, produced_char, consumed, and consumed_char of
 255   the structure *CODING.  They also set the member result to one of
 256   CODING_RESULT_XXX indicating how the encoding finished.
 257
  258   DST_BYTES zero means that source area and destination area are
  259   overlapped, which means that we can produce encoded text until it
  260   reaches the head of the not-yet-encoded source text.
 261
 262   Below is a template of these functions.  */
 263 #if 0
 264 static void
 265 encode_coding_XXX (coding)
 266      struct coding_system *coding;
 267 {
 268   int multibytep = coding->dst_multibyte;
 269   int *charbuf = coding->charbuf;
 270   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 271   unsigned char *dst = coding->destination + coding->produced;
 272   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 273   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 274   int produced_chars = 0;
 275
 276   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 277     {
 278       int c = *charbuf;
 279       /* Encode C into DST, and increment DST.  */
 280     }
 281  label_no_more_destination:
 282   /* How many chars and bytes we produced.  */
 283   coding->produced_char += produced_chars;
 284   coding->produced = dst - coding->destination;
 285 }
 286 #endif
 287
 288 \f
 289 /*** 1. Preamble ***/
 290
 291 #include <config.h>
 292 #include <stdio.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302
  303 Lisp_Object Vcoding_system_hash_table; /* presumably maps a coding system symbol to its attribute vector -- TODO confirm */
  304
  305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
  306 Lisp_Object Qunix, Qdos;
  307 extern Lisp_Object Qmac;        /* defined in frame.c */
  308 Lisp_Object Qbuffer_file_coding_system;
  309 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  310 Lisp_Object Qdefault_char;
  311 Lisp_Object Qno_conversion, Qundecided;
  312 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
  313 Lisp_Object Qbig, Qlittle;
  314 Lisp_Object Qcoding_system_history;
  315 Lisp_Object Qvalid_codes;
  316 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char; /* sic: "defalut"; the spelling is kept because other code may refer to it */
  317 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
  318 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
  319 Lisp_Object QCascii_compatible_p;
  320   /* Symbols named after I/O operations; the first two are defined in another file.  */
  321 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
  322 Lisp_Object Qcall_process, Qcall_process_region;
  323 Lisp_Object Qstart_process, Qopen_network_stream;
  324 Lisp_Object Qtarget_idx;
  325   /* Presumably symbols describing how a code conversion ended -- TODO confirm.  */
  326 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
  327 Lisp_Object Qinterrupted, Qinsufficient_memory;
  328
  329 /* If a symbol has this property, evaluate the property value to
  330    define the symbol as a coding system.  */
  331 static Lisp_Object Qcoding_system_define_form;
  332
  333 int coding_system_require_warning;
  334
  335 Lisp_Object Vselect_safe_coding_system_function;
  336
  337 /* Mnemonic string for each end-of-line format.  */
  338 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
  339 /* Mnemonic string indicating that the end-of-line format is not yet
  340    decided.  */
  341 Lisp_Object eol_mnemonic_undecided;
 342
  343 #ifdef emacs
  344
  345 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
  346
  347 Lisp_Object Qcoding_system_p, Qcoding_system_error;
  348
  349 /* Coding systems emacs-mule and raw-text are for converting only the
  350    end-of-line format.  */
  351 Lisp_Object Qemacs_mule, Qraw_text;
  352 Lisp_Object Qutf_8_emacs;
  353
  354 /* Coding systems are handed between Emacs Lisp programs and C internal
  355    routines by the following three variables.  */
  356 /* Coding system for reading files and receiving data from a process.  */
  357 Lisp_Object Vcoding_system_for_read;
  358 /* Coding system for writing files and sending data to a process.  */
  359 Lisp_Object Vcoding_system_for_write;
  360 /* Coding system actually used in the latest I/O.  */
  361 Lisp_Object Vlast_coding_system_used;
  362 /* Set to non-nil when an error is detected during code conversion.  */
  363 Lisp_Object Vlast_code_conversion_error;
  364 /* A vector of length 256 which contains information about special
  365    Latin codes (especially for dealing with Microsoft codes).  */
  366 Lisp_Object Vlatin_extra_code_table;
  367
  368 /* Flag to inhibit code conversion of the end-of-line format.  */
  369 int inhibit_eol_conversion;
  370
  371 /* Flag to inhibit ISO2022 escape sequence detection.  */
  372 int inhibit_iso_escape_detection;
  373
  374 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
  375 int inherit_process_coding_system;
  376
  377 /* Coding system to be used to encode text for terminal display.  */
  378 struct coding_system terminal_coding;
  379
  380 /* Coding system to be used to encode text for terminal display when
  381    the terminal coding system is nil.  */
  382 struct coding_system safe_terminal_coding;
  383
  384 /* Coding system of what is sent from the terminal keyboard.  */
  385 struct coding_system keyboard_coding;
  386
  387 Lisp_Object Vfile_coding_system_alist;
  388 Lisp_Object Vprocess_coding_system_alist;
  389 Lisp_Object Vnetwork_coding_system_alist;
  390
  391 Lisp_Object Vlocale_coding_system;
  392
  393 #endif /* emacs */
 394
  395 /* Flag telling whether to look up translation tables during
  396    character code conversion.  */
  397 Lisp_Object Venable_character_translation;
  398 /* Standard translation table to look up on decoding (reading).  */
  399 Lisp_Object Vstandard_translation_table_for_decode;
  400 /* Standard translation table to look up on encoding (writing).  */
  401 Lisp_Object Vstandard_translation_table_for_encode;
  402
  403 Lisp_Object Qtranslation_table;
  404 Lisp_Object Qtranslation_table_id;
  405 Lisp_Object Qtranslation_table_for_decode;
  406 Lisp_Object Qtranslation_table_for_encode;
  407
  408 /* Alist mapping charsets to revision numbers.  */
  409 static Lisp_Object Vcharset_revision_table;
  410
  411 /* Default coding systems used for process I/O.  */
  412 Lisp_Object Vdefault_process_coding_system;
  413
  414 /* Char table for translating Quail and self-inserting input.  */
  415 Lisp_Object Vtranslation_table_for_input;
  416
  417 /* Two special coding systems: one for SJIS and one for big5.  */
  418 Lisp_Object Vsjis_coding_system;
  419 Lisp_Object Vbig5_coding_system;
 420
 421 /* ISO2022 section */
  422 /* Element REG (converted with XINT) of the vector in the coding_attr_iso_initial slot of CODING's attributes.  */
  423 #define CODING_ISO_INITIAL(coding, reg)                 \
  424   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
  425                      coding_attr_iso_initial),          \
  426                reg)))
 427
 428
 429 #define CODING_ISO_REQUEST(coding, charset_id)  \
 430   ((charset_id <= (coding)->max_charset_id      \
 431     ? (coding)->safe_charsets[charset_id]       \
 432     : -1))
 433
  434 /* Accessors for the ISO2022 conversion state kept in CODING->spec.iso_2022.  */
  435 #define CODING_ISO_FLAGS(coding)        \
  436   ((coding)->spec.iso_2022.flags)
  437 #define CODING_ISO_DESIGNATION(coding, reg)     \
  438   ((coding)->spec.iso_2022.current_designation[reg])
  439 #define CODING_ISO_INVOCATION(coding, plane)    \
  440   ((coding)->spec.iso_2022.current_invocation[plane])
  441 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
  442   ((coding)->spec.iso_2022.single_shifting)
  443 #define CODING_ISO_BOL(coding)  \
  444   ((coding)->spec.iso_2022.bol)
  445 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  446   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 447
  448 /* Control characters of ISO2022.  */
  449                         /* code */      /* function */
  450 #define ISO_CODE_LF     0x0A            /* line-feed */
  451 #define ISO_CODE_CR     0x0D            /* carriage-return */
  452 #define ISO_CODE_SO     0x0E            /* shift-out */
  453 #define ISO_CODE_SI     0x0F            /* shift-in */
  454 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
  455 #define ISO_CODE_ESC    0x1B            /* escape */
  456 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
  457 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
  458 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
  459
  460 /* Every 1-byte code of ISO2022 is classified into one of the
  461    following classes.  */
  462 enum iso_code_class_type
  463   {
  464     ISO_control_0,              /* Control codes in the range
  465                                    0x00..0x1F and 0x7F, except for the
  466                                    following 4 codes.  */
  467     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  468     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  469     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  470     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  471     ISO_control_1,              /* Control codes in the range
  472                                    0x80..0x9F, except for the
  473                                    following 3 codes.  */
  474     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  475     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  476     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  477     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  478     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  479     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  480     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  481   };
 482
  483 /** The macros CODING_ISO_FLAG_XXX each define a flag bit of the
  484     `iso-flags' attribute of an iso2022 coding system.  */
  485
  486 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
  487    instead of the correct short-form sequence (e.g. ESC $ A).  */
  488 #define CODING_ISO_FLAG_LONG_FORM       0x0001
  489
  490 /* If set, reset graphic planes and registers at end-of-line to the
  491    initial state.  */
  492 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
  493
  494 /* If set, reset graphic planes and registers before any control
  495    characters to the initial state.  */
  496 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
  497
  498 /* If set, encode in a 7-bit environment.  */
  499 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
  500
  501 /* If set, use locking-shift function.  */
  502 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
  503
  504 /* If set, use single-shift function.  Overrides
  505    CODING_ISO_FLAG_LOCKING_SHIFT.  */
  506 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
  507
  508 /* If set, use designation escape sequence.  */
  509 #define CODING_ISO_FLAG_DESIGNATION     0x0040
  510
  511 /* If set, produce revision number sequence.  */
  512 #define CODING_ISO_FLAG_REVISION        0x0080
  513
  514 /* If set, produce ISO6429's direction specifying sequence.  */
  515 #define CODING_ISO_FLAG_DIRECTION       0x0100
  516
  517 /* If set, assume designation states are reset at beginning of line on
  518    output.  */
  519 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
  520
  521 /* If set, designation sequence should be placed at beginning of line
  522    on output.  */
  523 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
  524
  525 /* If set, do not encode unsafe characters on output.  */
  526 #define CODING_ISO_FLAG_SAFE            0x0800
  527
  528 /* If set, extra latin codes (128..159) are accepted as valid codes
  529    on input.  */
  530 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
  531   /* The five flags below carry no description in this file.  */
  532 #define CODING_ISO_FLAG_COMPOSITION     0x2000
  533
  534 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
  535
  536 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
  537
  538 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
  539
  540 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
  541
  542 /* A character to be produced on output if encoding of the original
  543    character is prohibited by CODING_ISO_FLAG_SAFE.  */
  544 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 545
 546
  547 /* UTF-16 section: accessors for the members of CODING->spec.utf_16.  */
  548 #define CODING_UTF_16_BOM(coding)       \
  549   ((coding)->spec.utf_16.bom)
  550
  551 #define CODING_UTF_16_ENDIAN(coding)    \
  552   ((coding)->spec.utf_16.endian)
  553
  554 #define CODING_UTF_16_SURROGATE(coding) \
  555   ((coding)->spec.utf_16.surrogate)
  556
  557
  558 /* CCL section: accessors for the CCL slots of CODING's attribute vector.  */
  559 #define CODING_CCL_DECODER(coding)      \
  560   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
  561 #define CODING_CCL_ENCODER(coding)      \
  562   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
  563 #define CODING_CCL_VALIDS(coding)                                          \
  564   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 565
  566 /* Index of each coding category in `coding_categories'.  */
  567
  568 enum coding_category
  569   {
  570     coding_category_iso_7,
  571     coding_category_iso_7_tight,
  572     coding_category_iso_8_1,
  573     coding_category_iso_8_2,
  574     coding_category_iso_7_else,
  575     coding_category_iso_8_else,
  576     coding_category_utf_8,
  577     coding_category_utf_16_auto,
  578     coding_category_utf_16_be,
  579     coding_category_utf_16_le,
  580     coding_category_utf_16_be_nosig,
  581     coding_category_utf_16_le_nosig,
  582     coding_category_charset,
  583     coding_category_sjis,
  584     coding_category_big5,
  585     coding_category_ccl,
  586     coding_category_emacs_mule,
  587     /* All of the above are targets of code detection.  */
  588     coding_category_raw_text,
  589     coding_category_undecided,
  590     coding_category_max         /* Number of categories; used as an array size below.  */
  591   };
 592
  593 /* Definitions of flag bits used in detect_coding_XXXX: one bit per enum coding_category value.  */
  594 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
  595 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
  596 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
  597 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
  598 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
  599 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
  600 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
  601 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
  602 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
  603 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
  604 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
  605 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
  606 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
  607 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
  608 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
  609 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
  610 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
  611 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
  612   /* NOTE(review): CATEGORY_MASK_ANY omits CATEGORY_MASK_UTF_16_AUTO and CATEGORY_MASK_RAW_TEXT -- confirm this is intended.  */
  613 /* This value is returned if detect_coding_mask () finds nothing other
  614    than ASCII characters.  */
  615 #define CATEGORY_MASK_ANY               \
  616   (CATEGORY_MASK_ISO_7                  \
  617    | CATEGORY_MASK_ISO_7_TIGHT          \
  618    | CATEGORY_MASK_ISO_8_1              \
  619    | CATEGORY_MASK_ISO_8_2              \
  620    | CATEGORY_MASK_ISO_7_ELSE           \
  621    | CATEGORY_MASK_ISO_8_ELSE           \
  622    | CATEGORY_MASK_UTF_8                \
  623    | CATEGORY_MASK_UTF_16_BE            \
  624    | CATEGORY_MASK_UTF_16_LE            \
  625    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  626    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
  627    | CATEGORY_MASK_CHARSET              \
  628    | CATEGORY_MASK_SJIS                 \
  629    | CATEGORY_MASK_BIG5                 \
  630    | CATEGORY_MASK_CCL                  \
  631    | CATEGORY_MASK_EMACS_MULE)
  632
  633   /* Convenience unions of the single-category bits above.  */
  634 #define CATEGORY_MASK_ISO_7BIT \
  635   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
  636
  637 #define CATEGORY_MASK_ISO_8BIT \
  638   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
  639
  640 #define CATEGORY_MASK_ISO_ELSE \
  641   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
  642
  643 #define CATEGORY_MASK_ISO_ESCAPE        \
  644   (CATEGORY_MASK_ISO_7                  \
  645    | CATEGORY_MASK_ISO_7_TIGHT          \
  646    | CATEGORY_MASK_ISO_7_ELSE           \
  647    | CATEGORY_MASK_ISO_8_ELSE)
  648
  649 #define CATEGORY_MASK_ISO       \
  650   (  CATEGORY_MASK_ISO_7BIT     \
  651      | CATEGORY_MASK_ISO_8BIT   \
  652      | CATEGORY_MASK_ISO_ELSE)
  653   /* NOTE(review): CATEGORY_MASK_UTF_16_AUTO is not part of CATEGORY_MASK_UTF_16 either -- confirm.  */
  654 #define CATEGORY_MASK_UTF_16            \
  655   (CATEGORY_MASK_UTF_16_BE              \
  656    | CATEGORY_MASK_UTF_16_LE            \
  657    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  658    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 659
 660
  661 /* List of symbols `coding-category-xxx' ordered by priority.  This
  662    variable is exposed to Emacs Lisp.  */
  663 static Lisp_Object Vcoding_category_list;
  664
  665 /* Table of coding categories (Lisp symbols).  This variable is for
  666    internal use only.  */
  667 static Lisp_Object Vcoding_category_table;
  668
  669 /* Table of coding categories ordered by priority.  */
  670 static enum coding_category coding_priorities[coding_category_max];
  671
  672 /* The Nth element is a coding context for the coding system bound to
  673    the Nth coding category.  */
  674 static struct coding_system coding_categories[coding_category_max];
 675
 676 /*** Commonly used macros and functions ***/
  677 /* min and max evaluate one of their arguments twice; do not pass expressions with side effects.  */
  678 #ifndef min
  679 #define min(a, b) ((a) < (b) ? (a) : (b))
  680 #endif
  681 #ifndef max
  682 #define max(a, b) ((a) > (b) ? (a) : (b))
  683 #endif
  684   /* Set ATTRS to CODING_ID_ATTRS (CODING->id) and CHARSET_LIST to CODING_ATTR_CHARSET_LIST (ATTRS).  */
  685 #define CODING_GET_INFO(coding, attrs, charset_list)    \
  686   do {                                                  \
  687     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
  688     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  689   } while (0)
 690
 691
  692 /* Safely get one byte from the source text pointed to by SRC, which ends
  693    at SRC_END, and set C to that byte.  If SRC has reached SRC_END, jump
  694    to `no_more_source' (after recording CODING_RESULT_INSUFFICIENT_SRC
  695    when SRC_BASE < SRC).  If multibytep is nonzero and the byte is 0xC0
  696    or 0xC1, merge it with the next byte; for any other byte >= 0x80, set
  697    C to the negated string_char result and record CODING_RESULT_INVALID_SRC.
  698    Needs in scope: coding, src, src_base, src_end, multibytep, consumed_chars.  */
  699
  700 #define ONE_MORE_BYTE(c)                                \
  701   do {                                                  \
  702     if (src == src_end)                                 \
  703       {                                                 \
  704         if (src_base < src)                             \
  705           record_conversion_result                      \
  706             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  707         goto no_more_source;                            \
  708       }                                                 \
  709     c = *src++;                                         \
  710     if (multibytep && (c & 0x80))                       \
  711       {                                                 \
  712         if ((c & 0xFE) == 0xC0)                         \
  713           c = ((c & 1) << 6) | *src++;                  \
  714         else                                            \
  715           {                                             \
  716             src--;                                      \
  717             c = - string_char (src, &src, NULL);        \
  718             record_conversion_result                    \
  719               (coding, CODING_RESULT_INVALID_SRC);      \
  720           }                                             \
  721       }                                                 \
  722     consumed_chars++;                                   \
  723   } while (0)
 724
  725 /* Like ONE_MORE_BYTE, but without the SRC == SRC_END check; it never jumps to `no_more_source'.  */
  726 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
  727   do {                                                  \
  728     c = *src++;                                         \
  729     if (multibytep && (c & 0x80))                       \
  730       {                                                 \
  731         if ((c & 0xFE) == 0xC0)                         \
  732           c = ((c & 1) << 6) | *src++;                  \
  733         else                                            \
  734           {                                             \
  735             src--;                                      \
  736             c = - string_char (src, &src, NULL);        \
  737             record_conversion_result                    \
  738               (coding, CODING_RESULT_INVALID_SRC);      \
  739           }                                             \
  740       }                                                 \
  741     consumed_chars++;                                   \
  742   } while (0)
 743
 744
  745 /* Store a byte C in the place pointed to by DST, advance DST to the
  746    next free position, and increment PRODUCED_CHARS.  The caller should
  747    ensure that C is 0..127, and declare and set the variables `dst'
  748    and `produced_chars' appropriately in advance.
  749 */
  750
  751
  752 #define EMIT_ONE_ASCII_BYTE(c)  \
  753   do {                          \
  754     produced_chars++;           \
  755     *dst++ = (c);               \
  756   } while (0)
  757
  758
  759 /* Like EMIT_ONE_ASCII_BYTE, but store two bytes: C1 and then C2.  */
  760
  761 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
  762   do {                                  \
  763     produced_chars += 2;                \
  764     *dst++ = (c1), *dst++ = (c2);       \
  765   } while (0)
 766
 767
  768 /* Store a byte C in the place pointed to by DST, advance DST to the
  769    next free position, and increment PRODUCED_CHARS.  If MULTIBYTEP is
  770    nonzero, store it in multibyte form: a value >= 0x80 is first mapped
  771    with BYTE8_TO_CHAR, then written with CHAR_STRING_ADVANCE.  The caller
  772    must declare and set `dst', `multibytep' and `produced_chars'.  */
  773
  774 #define EMIT_ONE_BYTE(c)                \
  775   do {                                  \
  776     produced_chars++;                   \
  777     if (multibytep)                     \
  778       {                                 \
  779         int ch = (c);                   \
  780         if (ch >= 0x80)                 \
  781           ch = BYTE8_TO_CHAR (ch);      \
  782         CHAR_STRING_ADVANCE (ch, dst);  \
  783       }                                 \
  784     else                                \
  785       *dst++ = (c);                     \
  786   } while (0)
 787
 788
 789 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 790
 791 #define EMIT_TWO_BYTES(c1, c2)          \
 792   do {                                  \
 793     produced_chars += 2;                \
 794     if (multibytep)                     \
 795       {                                 \
 796         int ch;                         \
 797                                         \
 798         ch = (c1);                      \
 799         if (ch >= 0x80)                 \
 800           ch = BYTE8_TO_CHAR (ch);      \
 801         CHAR_STRING_ADVANCE (ch, dst);  \
 802         ch = (c2);                      \
 803         if (ch >= 0x80)                 \
 804           ch = BYTE8_TO_CHAR (ch);      \
 805         CHAR_STRING_ADVANCE (ch, dst);  \
 806       }                                 \
 807     else                                \
 808       {                                 \
 809         *dst++ = (c1);                  \
 810         *dst++ = (c2);                  \
 811       }                                 \
 812   } while (0)
 813
 814
 815 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 816   do {                                  \
 817     EMIT_ONE_BYTE (c1);                 \
 818     EMIT_TWO_BYTES (c2, c3);            \
 819   } while (0)
 820
 821
 822 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 823   do {                                          \
 824     EMIT_TWO_BYTES (c1, c2);                    \
 825     EMIT_TWO_BYTES (c3, c4);                    \
 826   } while (0)
 827
 828
/* Prototypes for static functions.  The P_ wrapper keeps the
   parameter lists usable alongside the old-style function definitions
   used in this file.  */

/* Record the outcome of a conversion step.  */
static void record_conversion_result P_ ((struct coding_system *coding,
                                          enum coding_result_code result));

/* UTF-8: detector, decoder, encoder.  */
static int detect_coding_utf_8 P_ ((struct coding_system *,
                                    struct coding_detection_info *info));
static void decode_coding_utf_8 P_ ((struct coding_system *));
static int encode_coding_utf_8 P_ ((struct coding_system *));

/* UTF-16: detector, decoder, encoder.  */
static int detect_coding_utf_16 P_ ((struct coding_system *,
                                     struct coding_detection_info *info));
static void decode_coding_utf_16 P_ ((struct coding_system *));
static int encode_coding_utf_16 P_ ((struct coding_system *));

/* ISO 2022: detector, decoder, encoder.  */
static int detect_coding_iso_2022 P_ ((struct coding_system *,
                                       struct coding_detection_info *info));
static void decode_coding_iso_2022 P_ ((struct coding_system *));
static int encode_coding_iso_2022 P_ ((struct coding_system *));

/* emacs-mule: detector, decoder, encoder.  */
static int detect_coding_emacs_mule P_ ((struct coding_system *,
                                         struct coding_detection_info *info));
static void decode_coding_emacs_mule P_ ((struct coding_system *));
static int encode_coding_emacs_mule P_ ((struct coding_system *));

/* Shift-JIS: detector, decoder, encoder.  */
static int detect_coding_sjis P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_sjis P_ ((struct coding_system *));
static int encode_coding_sjis P_ ((struct coding_system *));

/* BIG5: detector, decoder, encoder.  */
static int detect_coding_big5 P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_big5 P_ ((struct coding_system *));
static int encode_coding_big5 P_ ((struct coding_system *));

/* CCL: detector, decoder, encoder.  */
static int detect_coding_ccl P_ ((struct coding_system *,
                                  struct coding_detection_info *info));
static void decode_coding_ccl P_ ((struct coding_system *));
static int encode_coding_ccl P_ ((struct coding_system *));

/* Raw text: decoder and encoder only.  */
static void decode_coding_raw_text P_ ((struct coding_system *));
static int encode_coding_raw_text P_ ((struct coding_system *));

/* Refreshing the source/destination pointers and growing the
   destination.  */
static void coding_set_source P_ ((struct coding_system *));
static void coding_set_destination P_ ((struct coding_system *));
static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
static void coding_alloc_by_making_gap P_ ((struct coding_system *,
                                            EMACS_INT));
static unsigned char *alloc_destination P_ ((struct coding_system *,
                                             EMACS_INT, unsigned char *));

/* Remaining file-local helpers.  */
static void setup_iso_safe_charsets P_ ((Lisp_Object));
static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
                                                     int *, int *,
                                                     unsigned char *));
static int detect_eol P_ ((const unsigned char *,
                           EMACS_INT, enum coding_category));
static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
static void decode_eol P_ ((struct coding_system *));
static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
                                        int, int *, int *));
static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
static INLINE void produce_composition P_ ((struct coding_system *, int *,
                                            EMACS_INT));
static INLINE void produce_charset P_ ((struct coding_system *, int *,
                                        EMACS_INT));
static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
static int decode_coding P_ ((struct coding_system *));
static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
                                                      struct coding_system *,
                                                      int *, EMACS_INT *));
static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
                                                  struct coding_system *,
                                                  int *, EMACS_INT *));
static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
static int encode_coding P_ ((struct coding_system *));
static Lisp_Object make_conversion_work_buffer P_ ((int));
static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
static INLINE int char_encodable_p P_ ((int, Lisp_Object));
static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 907
 908 static void
 909 record_conversion_result (struct coding_system *coding,
 910                           enum coding_result_code result)
 911 {
 912   coding->result = result;
 913   switch (result)
 914     {
 915     case CODING_RESULT_INSUFFICIENT_SRC:
 916       Vlast_code_conversion_error = Qinsufficient_source;
 917       break;
 918     case CODING_RESULT_INCONSISTENT_EOL:
 919       Vlast_code_conversion_error = Qinconsistent_eol;
 920       break;
 921     case CODING_RESULT_INVALID_SRC:
 922       Vlast_code_conversion_error = Qinvalid_source;
 923       break;
 924     case CODING_RESULT_INTERRUPT:
 925       Vlast_code_conversion_error = Qinterrupted;
 926       break;
 927     case CODING_RESULT_INSUFFICIENT_MEM:
 928       Vlast_code_conversion_error = Qinsufficient_memory;
 929       break;
 930     default:
 931       Vlast_code_conversion_error = intern ("Unknown error");
 932     }
 933 }
 934
 935 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 936   do {                                                                       \
 937     charset_map_loaded = 0;                                                  \
 938     c = DECODE_CHAR (charset, code);                                         \
 939     if (charset_map_loaded)                                                  \
 940       {                                                                      \
 941         const unsigned char *orig = coding->source;                          \
 942         EMACS_INT offset;                                                    \
 943                                                                              \
 944         coding_set_source (coding);                                          \
 945         offset = coding->source - orig;                                      \
 946         src += offset;                                                       \
 947         src_base += offset;                                                  \
 948         src_end += offset;                                                   \
 949       }                                                                      \
 950   } while (0)
 951
 952
 953 #define ASSURE_DESTINATION(bytes)                               \
 954   do {                                                          \
 955     if (dst + (bytes) >= dst_end)                               \
 956       {                                                         \
 957         int more_bytes = charbuf_end - charbuf + (bytes);       \
 958                                                                 \
 959         dst = alloc_destination (coding, more_bytes, dst);      \
 960         dst_end = coding->destination + coding->dst_bytes;      \
 961       }                                                         \
 962   } while (0)
 963
 964
 965
 966 static void
 967 coding_set_source (coding)
 968      struct coding_system *coding;
 969 {
 970   if (BUFFERP (coding->src_object))
 971     {
 972       struct buffer *buf = XBUFFER (coding->src_object);
 973
 974       if (coding->src_pos < 0)
 975         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 976       else
 977         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 978     }
 979   else if (STRINGP (coding->src_object))
 980     {
 981       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 982     }
 983   else
 984     /* Otherwise, the source is C string and is never relocated
 985        automatically.  Thus we don't have to update anything.  */
 986     ;
 987 }
 988
 989 static void
 990 coding_set_destination (coding)
 991      struct coding_system *coding;
 992 {
 993   if (BUFFERP (coding->dst_object))
 994     {
 995       if (coding->src_pos < 0)
 996         {
 997           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
 998           coding->dst_bytes = (GAP_END_ADDR
 999                                - (coding->src_bytes - coding->consumed)
1000                                - coding->destination);
1001         }
1002       else
1003         {
1004           /* We are sure that coding->dst_pos_byte is before the gap
1005              of the buffer. */
1006           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1007                                  + coding->dst_pos_byte - 1);
1008           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1009                                - coding->destination);
1010         }
1011     }
1012   else
1013     /* Otherwise, the destination is C string and is never relocated
1014        automatically.  Thus we don't have to update anything.  */
1015     ;
1016 }
1017
1018
1019 static void
1020 coding_alloc_by_realloc (coding, bytes)
1021      struct coding_system *coding;
1022      EMACS_INT bytes;
1023 {
1024   coding->destination = (unsigned char *) xrealloc (coding->destination,
1025                                                     coding->dst_bytes + bytes);
1026   coding->dst_bytes += bytes;
1027 }
1028
1029 static void
1030 coding_alloc_by_making_gap (coding, bytes)
1031      struct coding_system *coding;
1032      EMACS_INT bytes;
1033 {
1034   if (BUFFERP (coding->dst_object)
1035       && EQ (coding->src_object, coding->dst_object))
1036     {
1037       EMACS_INT add = coding->src_bytes - coding->consumed;
1038
1039       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1040       make_gap (bytes);
1041       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1042     }
1043   else
1044     {
1045       Lisp_Object this_buffer;
1046
1047       this_buffer = Fcurrent_buffer ();
1048       set_buffer_internal (XBUFFER (coding->dst_object));
1049       make_gap (bytes);
1050       set_buffer_internal (XBUFFER (this_buffer));
1051     }
1052 }
1053
1054
1055 static unsigned char *
1056 alloc_destination (coding, nbytes, dst)
1057      struct coding_system *coding;
1058      EMACS_INT nbytes;
1059      unsigned char *dst;
1060 {
1061   EMACS_INT offset = dst - coding->destination;
1062
1063   if (BUFFERP (coding->dst_object))
1064     coding_alloc_by_making_gap (coding, nbytes);
1065   else
1066     coding_alloc_by_realloc (coding, nbytes);
1067   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1068   coding_set_destination (coding);
1069   dst = coding->destination + offset;
1070   return dst;
1071 }
1072
1073 /** Macros for annotations.  */
1074
/* Maximum length of annotation data (sum of annotations for
   composition and charset).  Each `4' matches the four-element
   annotation that ADD_COMPOSITION_DATA and ADD_CHARSET_DATA produce.
   The middle term presumably covers up to MAX_COMPOSITION_COMPONENTS
   component characters with one rule between each adjacent pair --
   TODO confirm against the composition code.  */
#define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1078
/* Annotation data is stored in the array coding->charbuf in this
   format:
     [ -LENGTH ANNOTATION_MASK NCHARS ... ]
   LENGTH is the number of elements in the annotation.
   ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
   NCHARS is the number of characters in the text annotated.

   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... METHOD [ COMPOSITION-COMPONENTS ... ]
   METHOD is one of enum composition_method.
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.

   In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
   follows.  */
1097
/* Write the three-element annotation header [ -LEN MASK NCHARS ] at
   BUF, advancing BUF past it, and set coding->annotated to 1.  BUF
   must be a modifiable pointer lvalue, and a variable `coding' must
   be in scope.

   The definition deliberately does not end in a semicolon, so that
   `ADD_ANNOTATION_DATA (...);' is exactly one statement and can be
   used as the body of an `if' that has an `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1105
/* Write a composition annotation at BUF and advance BUF past it: the
   annotation header with length 4 and mask
   CODING_ANNOTATE_COMPOSITION_MASK for NCHARS characters, followed by
   METHOD.  */

#define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (method);                                                    \
  } while (0)
1111
1112
/* Write a charset annotation at BUF and advance BUF past it: the
   annotation header with length 4 and mask
   CODING_ANNOTATE_CHARSET_MASK for NCHARS characters, followed by the
   charset ID.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1118
1119 \f
1120 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1121
1122
1123
1124 \f
1125 /*** 3. UTF-8 ***/
1126
1127 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1128    Check if a text is encoded in UTF-8.  If it is, return 1, else
1129    return 0.  */
1130
/* Predicates classifying one octet C of a UTF-8 sequence by its high
   bits.  Apart from UTF_8_1_OCTET_P, which is a plain comparison,
   each tests a fixed bit prefix of C.  */

/* C is below 0x80.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
/* C has the form 10xxxxxx.  */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
/* C has the form 110xxxxx.  */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
/* C has the form 1110xxxx.  */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
/* C has the form 11110xxx.  */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
/* C has the form 111110xx.  */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1137
1138 static int
1139 detect_coding_utf_8 (coding, detect_info)
1140      struct coding_system *coding;
1141      struct coding_detection_info *detect_info;
1142 {
1143   const unsigned char *src = coding->source, *src_base;
1144   const unsigned char *src_end = coding->source + coding->src_bytes;
1145   int multibytep = coding->src_multibyte;
1146   int consumed_chars = 0;
1147   int found = 0;
1148
1149   detect_info->checked |= CATEGORY_MASK_UTF_8;
1150   /* A coding system of this category is always ASCII compatible.  */
1151   src += coding->head_ascii;
1152
1153   while (1)
1154     {
1155       int c, c1, c2, c3, c4;
1156
1157       src_base = src;
1158       ONE_MORE_BYTE (c);
1159       if (c < 0 || UTF_8_1_OCTET_P (c))
1160         continue;
1161       ONE_MORE_BYTE (c1);
1162       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1163         break;
1164       if (UTF_8_2_OCTET_LEADING_P (c))
1165         {
1166           found = CATEGORY_MASK_UTF_8;
1167           continue;
1168         }
1169       ONE_MORE_BYTE (c2);
1170       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1171         break;
1172       if (UTF_8_3_OCTET_LEADING_P (c))
1173         {
1174           found = CATEGORY_MASK_UTF_8;
1175           continue;
1176         }
1177       ONE_MORE_BYTE (c3);
1178       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1179         break;
1180       if (UTF_8_4_OCTET_LEADING_P (c))
1181         {
1182           found = CATEGORY_MASK_UTF_8;
1183           continue;
1184         }
1185       ONE_MORE_BYTE (c4);
1186       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1187         break;
1188       if (UTF_8_5_OCTET_LEADING_P (c))
1189         {
1190           found = CATEGORY_MASK_UTF_8;
1191           continue;
1192         }
1193       break;
1194     }
1195   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1196   return 0;
1197
1198  no_more_source:
1199   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1200     {
1201       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1202       return 0;
1203     }
1204   detect_info->found |= found;
1205   return 1;
1206 }
1207
1208
1209 static void
1210 decode_coding_utf_8 (coding)
1211      struct coding_system *coding;
1212 {
1213   const unsigned char *src = coding->source + coding->consumed;
1214   const unsigned char *src_end = coding->source + coding->src_bytes;
1215   const unsigned char *src_base;
1216   int *charbuf = coding->charbuf + coding->charbuf_used;
1217   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1218   int consumed_chars = 0, consumed_chars_base;
1219   int multibytep = coding->src_multibyte;
1220   Lisp_Object attr, charset_list;
1221
1222   CODING_GET_INFO (coding, attr, charset_list);
1223
1224   while (1)
1225     {
1226       int c, c1, c2, c3, c4, c5;
1227
1228       src_base = src;
1229       consumed_chars_base = consumed_chars;
1230
1231       if (charbuf >= charbuf_end)
1232         break;
1233
1234       ONE_MORE_BYTE (c1);
1235       if (c1 < 0)
1236         {
1237           c = - c1;
1238         }
1239       else if (UTF_8_1_OCTET_P(c1))
1240         {
1241           c = c1;
1242         }
1243       else
1244         {
1245           ONE_MORE_BYTE (c2);
1246           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1247             goto invalid_code;
1248           if (UTF_8_2_OCTET_LEADING_P (c1))
1249             {
1250               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1251               /* Reject overlong sequences here and below.  Encoders
1252                  producing them are incorrect, they can be misleading,
1253                  and they mess up read/write invariance.  */
1254               if (c < 128)
1255                 goto invalid_code;
1256             }
1257           else
1258             {
1259               ONE_MORE_BYTE (c3);
1260               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1261                 goto invalid_code;
1262               if (UTF_8_3_OCTET_LEADING_P (c1))
1263                 {
1264                   c = (((c1 & 0xF) << 12)
1265                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1266                   if (c < 0x800
1267                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1268                     goto invalid_code;
1269                 }
1270               else
1271                 {
1272                   ONE_MORE_BYTE (c4);
1273                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1274                     goto invalid_code;
1275                   if (UTF_8_4_OCTET_LEADING_P (c1))
1276                     {
1277                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1278                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1279                     if (c < 0x10000)
1280                       goto invalid_code;
1281                     }
1282                   else
1283                     {
1284                       ONE_MORE_BYTE (c5);
1285                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1286                         goto invalid_code;
1287                       if (UTF_8_5_OCTET_LEADING_P (c1))
1288                         {
1289                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1290                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1291                                | (c5 & 0x3F));
1292                           if ((c > MAX_CHAR) || (c < 0x200000))
1293                             goto invalid_code;
1294                         }
1295                       else
1296                         goto invalid_code;
1297                     }
1298                 }
1299             }
1300         }
1301
1302       *charbuf++ = c;
1303       continue;
1304
1305     invalid_code:
1306       src = src_base;
1307       consumed_chars = consumed_chars_base;
1308       ONE_MORE_BYTE (c);
1309       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1310       coding->errors++;
1311     }
1312
1313  no_more_source:
1314   coding->consumed_char += consumed_chars_base;
1315   coding->consumed = src_base - coding->source;
1316   coding->charbuf_used = charbuf - coding->charbuf;
1317 }
1318
1319
1320 static int
1321 encode_coding_utf_8 (coding)
1322      struct coding_system *coding;
1323 {
1324   int multibytep = coding->dst_multibyte;
1325   int *charbuf = coding->charbuf;
1326   int *charbuf_end = charbuf + coding->charbuf_used;
1327   unsigned char *dst = coding->destination + coding->produced;
1328   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1329   int produced_chars = 0;
1330   int c;
1331
1332   if (multibytep)
1333     {
1334       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1335
1336       while (charbuf < charbuf_end)
1337         {
1338           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1339
1340           ASSURE_DESTINATION (safe_room);
1341           c = *charbuf++;
1342           if (CHAR_BYTE8_P (c))
1343             {
1344               c = CHAR_TO_BYTE8 (c);
1345               EMIT_ONE_BYTE (c);
1346             }
1347           else
1348             {
1349               CHAR_STRING_ADVANCE (c, pend);
1350               for (p = str; p < pend; p++)
1351                 EMIT_ONE_BYTE (*p);
1352             }
1353         }
1354     }
1355   else
1356     {
1357       int safe_room = MAX_MULTIBYTE_LENGTH;
1358
1359       while (charbuf < charbuf_end)
1360         {
1361           ASSURE_DESTINATION (safe_room);
1362           c = *charbuf++;
1363           if (CHAR_BYTE8_P (c))
1364             *dst++ = CHAR_TO_BYTE8 (c);
1365           else
1366             dst += CHAR_STRING (c, dst);
1367           produced_chars++;
1368         }
1369     }
1370   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1371   coding->produced_char += produced_chars;
1372   coding->produced = dst - coding->destination;
1373   return 0;
1374 }
1375
1376
1377 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1378    Check if a text is encoded in one of UTF-16 based coding systems.
1379    If it is, return 1, else return 0.  */
1380
/* Nonzero if bits 10..15 of VAL are 110110, i.e. a 16-bit VAL lies
   in 0xD800..0xDBFF.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if bits 10..15 of VAL are 110111, i.e. a 16-bit VAL lies
   in 0xDC00..0xDFFF.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or satisfies
   UTF_16_LOW_SURROGATE_P.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF)         \
   || UTF_16_LOW_SURROGATE_P (val))
1391
1392
1393 static int
1394 detect_coding_utf_16 (coding, detect_info)
1395      struct coding_system *coding;
1396      struct coding_detection_info *detect_info;
1397 {
1398   const unsigned char *src = coding->source, *src_base = src;
1399   const unsigned char *src_end = coding->source + coding->src_bytes;
1400   int multibytep = coding->src_multibyte;
1401   int consumed_chars = 0;
1402   int c1, c2;
1403
1404   detect_info->checked |= CATEGORY_MASK_UTF_16;
1405   if (coding->mode & CODING_MODE_LAST_BLOCK
1406       && (coding->src_chars & 1))
1407     {
1408       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1409       return 0;
1410     }
1411
1412   ONE_MORE_BYTE (c1);
1413   ONE_MORE_BYTE (c2);
1414   if ((c1 == 0xFF) && (c2 == 0xFE))
1415     {
1416       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1417                              | CATEGORY_MASK_UTF_16_AUTO);
1418       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1419                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1420                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1421     }
1422   else if ((c1 == 0xFE) && (c2 == 0xFF))
1423     {
1424       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1425                              | CATEGORY_MASK_UTF_16_AUTO);
1426       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1427                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1428                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1429     }
1430   else if (c1 >= 0 && c2 >= 0)
1431     {
1432       detect_info->rejected
1433         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1434     }
1435  no_more_source:
1436   return 1;
1437 }
1438
1439 static void
1440 decode_coding_utf_16 (coding)
1441      struct coding_system *coding;
1442 {
1443   const unsigned char *src = coding->source + coding->consumed;
1444   const unsigned char *src_end = coding->source + coding->src_bytes;
1445   const unsigned char *src_base;
1446   int *charbuf = coding->charbuf + coding->charbuf_used;
1447   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1448   int consumed_chars = 0, consumed_chars_base;
1449   int multibytep = coding->src_multibyte;
1450   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1451   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1452   int surrogate = CODING_UTF_16_SURROGATE (coding);
1453   Lisp_Object attr, charset_list;
1454
1455   CODING_GET_INFO (coding, attr, charset_list);
1456
1457   if (bom == utf_16_with_bom)
1458     {
1459       int c, c1, c2;
1460
1461       src_base = src;
1462       ONE_MORE_BYTE (c1);
1463       ONE_MORE_BYTE (c2);
1464       c = (c1 << 8) | c2;
1465
1466       if (endian == utf_16_big_endian
1467           ? c != 0xFEFF : c != 0xFFFE)
1468         {
1469           /* The first two bytes are not BOM.  Treat them as bytes
1470              for a normal character.  */
1471           src = src_base;
1472           coding->errors++;
1473         }
1474       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1475     }
1476   else if (bom == utf_16_detect_bom)
1477     {
1478       /* We have already tried to detect BOM and failed in
1479          detect_coding.  */
1480       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1481     }
1482
1483   while (1)
1484     {
1485       int c, c1, c2;
1486
1487       src_base = src;
1488       consumed_chars_base = consumed_chars;
1489
1490       if (charbuf + 2 >= charbuf_end)
1491         break;
1492
1493       ONE_MORE_BYTE (c1);
1494       if (c1 < 0)
1495         {
1496           *charbuf++ = -c1;
1497           continue;
1498         }
1499       ONE_MORE_BYTE (c2);
1500       if (c2 < 0)
1501         {
1502           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1503           *charbuf++ = -c2;
1504           continue;
1505         }
1506       c = (endian == utf_16_big_endian
1507            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1508       if (surrogate)
1509         {
1510           if (! UTF_16_LOW_SURROGATE_P (c))
1511             {
1512               if (endian == utf_16_big_endian)
1513                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1514               else
1515                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1516               *charbuf++ = c1;
1517               *charbuf++ = c2;
1518               coding->errors++;
1519               if (UTF_16_HIGH_SURROGATE_P (c))
1520                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1521               else
1522                 *charbuf++ = c;
1523             }
1524           else
1525             {
1526               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1527               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1528               *charbuf++ = 0x10000 + c;
1529             }
1530         }
1531       else
1532         {
1533           if (UTF_16_HIGH_SURROGATE_P (c))
1534             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1535           else
1536             *charbuf++ = c;
1537         }
1538     }
1539
1540  no_more_source:
1541   coding->consumed_char += consumed_chars_base;
1542   coding->consumed = src_base - coding->source;
1543   coding->charbuf_used = charbuf - coding->charbuf;
1544 }
1545
1546 static int
1547 encode_coding_utf_16 (coding)
1548      struct coding_system *coding;
1549 {
1550   int multibytep = coding->dst_multibyte;
1551   int *charbuf = coding->charbuf;
1552   int *charbuf_end = charbuf + coding->charbuf_used;
1553   unsigned char *dst = coding->destination + coding->produced;
1554   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1555   int safe_room = 8;
1556   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1557   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1558   int produced_chars = 0;
1559   Lisp_Object attrs, charset_list;
1560   int c;
1561
1562   CODING_GET_INFO (coding, attrs, charset_list);
1563
1564   if (bom != utf_16_without_bom)
1565     {
1566       ASSURE_DESTINATION (safe_room);
1567       if (big_endian)
1568         EMIT_TWO_BYTES (0xFE, 0xFF);
1569       else
1570         EMIT_TWO_BYTES (0xFF, 0xFE);
1571       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1572     }
1573
1574   while (charbuf < charbuf_end)
1575     {
1576       ASSURE_DESTINATION (safe_room);
1577       c = *charbuf++;
1578       if (c >= MAX_UNICODE_CHAR)
1579         c = coding->default_char;
1580
1581       if (c < 0x10000)
1582         {
1583           if (big_endian)
1584             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1585           else
1586             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1587         }
1588       else
1589         {
1590           int c1, c2;
1591
1592           c -= 0x10000;
1593           c1 = (c >> 10) + 0xD800;
1594           c2 = (c & 0x3FF) + 0xDC00;
1595           if (big_endian)
1596             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1597           else
1598             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1599         }
1600     }
1601   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1602   coding->produced = dst - coding->destination;
1603   coding->produced_char += produced_chars;
1604   return 0;
1605 }
1606
1607 \f
1608 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1609
1610 /* Emacs' internal format for representation of multiple character
1611    sets is a kind of multi-byte encoding, i.e. characters are
1612    represented by variable-length sequences of one-byte codes.
1613
1614    ASCII characters and control characters (e.g. `tab', `newline') are
1615    represented by one-byte sequences which are their ASCII codes, in
1616    the range 0x00 through 0x7F.
1617
1618    8-bit characters of the range 0x80..0x9F are represented by
1619    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1620    code + 0x20).
1621
1622    8-bit characters of the range 0xA0..0xFF are represented by
1623    one-byte sequences which are their 8-bit code.
1624
1625    The other characters are represented by a sequence of `base
1626    leading-code', optional `extended leading-code', and one or two
1627    `position-code's.  The length of the sequence is determined by the
1628    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1629    whereas extended leading-code and position-code take the range 0xA0
1630    through 0xFF.  See `charset.h' for more details about leading-code
1631    and position-code.
1632
1633    --- CODE RANGE of Emacs' internal format ---
1634    character set        range
1635    -------------        -----
1636    ascii                0x00..0x7F
1637    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1638    eight-bit-graphic    0xA0..0xFF
1639    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1640    ---------------------------------------------
1641
1642    As this is the internal character representation, the format is
1643    usually not used externally (i.e. in a file or in a data sent to a
1644    process).  But, it is possible to have a text externally in this
1645    format (i.e. by encoding by the coding system `emacs-mule').
1646
1647    In that case, a sequence of one-byte codes has a slightly different
1648    form.
1649
1650    At first, all characters in eight-bit-control are represented by
1651    one-byte sequences which are their 8-bit code.
1652
1653    Next, character composition data are represented by the byte
1654    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1655    where,
1656         METHOD is 0xF2 plus one of composition method (enum
1657         composition_method),
1658
1659         BYTES is 0xA0 plus a byte length of this composition data,
1660
1661         CHARS is 0xA0 plus a number of characters composed by this
1662         data,
1663
1664         COMPONENTs are characters of multibyte form or composition
1665         rules encoded by two-byte of ASCII codes.
1666
1667    In addition, for backward compatibility, the following formats are
1668    also recognized as composition data on decoding.
1669
1670    0x80 MSEQ ...
1671    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1672
1673    Here,
1674         MSEQ is a multibyte form but in this special format:
1675           ASCII: 0xA0 ASCII_CODE+0x80,
1676           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1677         RULE is a one byte code of the range 0xA0..0xF0 that
1678         represents a composition rule.
1679   */
1680
/* Table indexed by a byte value.  emacs_mule_char switches on the
   entry for a leading byte to choose between its 1-, 2-, 3- and
   4-byte cases, and detect_coding_emacs_mule uses the entry minus one
   as the number of bytes still to be read.  The table is not filled
   in at this point of the file.  */
1681 char emacs_mule_bytes[256];
1682
/* Decode one character of emacs-mule form that starts at SRC.

   On success return the character, store in *NBYTES the distance SRC
   was advanced, store in *NCHARS the local consumed_chars count (it
   starts at 0 here and is presumably incremented by ONE_MORE_BYTE),
   and, if ID is not NULL, store in *ID the id of the charset the
   character was decoded with.

   Return -1 if the byte sequence is invalid.  Return -2 via the
   `no_more_source' label, which is presumably reached from
   ONE_MORE_BYTE when the source ends inside the sequence.  On either
   failure *NBYTES, *NCHARS and *ID are left untouched.  */
1683 int
1684 emacs_mule_char (coding, src, nbytes, nchars, id)
1685      struct coding_system *coding;
1686      const unsigned char *src;
1687      int *nbytes, *nchars, *id;
1688 {
1689   const unsigned char *src_end = coding->source + coding->src_bytes;
1690   const unsigned char *src_base = src;
1691   int multibytep = coding->src_multibyte;
1692   struct charset *charset;
1693   unsigned code;
1694   int c;
1695   int consumed_chars = 0;
1696
1697   ONE_MORE_BYTE (c);
1698   if (c < 0)
1699     {
           /* ONE_MORE_BYTE produced a negative value: its magnitude is
              returned as the character, and the charset is the one
              registered in slot 0 of emacs_mule_charset.  */
1700       c = -c;
1701       charset = emacs_mule_charset[0];
1702     }
1703   else
1704     {
1705       switch (emacs_mule_bytes[c])
1706         {
1707         case 2:
               /* Leading byte selects the charset; one position byte
                  follows.  */
1708           if (! (charset = emacs_mule_charset[c]))
1709             goto invalid_code;
1710           ONE_MORE_BYTE (c);
1711           if (c < 0xA0)
1712             goto invalid_code;
1713           code = c & 0x7F;
1714           break;
1715
1716         case 3:
1717           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1718               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1719             {
                   /* Private leading code: the next byte selects the
                      charset and one position byte follows.  */
1720               ONE_MORE_BYTE (c);
1721               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1722                 goto invalid_code;
1723               ONE_MORE_BYTE (c);
1724               if (c < 0xA0)
1725                 goto invalid_code;
1726               code = c & 0x7F;
1727             }
1728           else
1729             {
                   /* Leading byte selects the charset; two position
                      bytes follow, high byte first.  */
1730               if (! (charset = emacs_mule_charset[c]))
1731                 goto invalid_code;
1732               ONE_MORE_BYTE (c);
1733               if (c < 0xA0)
1734                 goto invalid_code;
1735               code = (c & 0x7F) << 8;
1736               ONE_MORE_BYTE (c);
1737               if (c < 0xA0)
1738                 goto invalid_code;
1739               code |= c & 0x7F;
1740             }
1741           break;
1742
1743         case 4:
               /* The byte after the leading byte selects the charset;
                  two position bytes follow, high byte first.  */
1744           ONE_MORE_BYTE (c);
1745           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1746             goto invalid_code;
1747           ONE_MORE_BYTE (c);
1748           if (c < 0xA0)
1749             goto invalid_code;
1750           code = (c & 0x7F) << 8;
1751           ONE_MORE_BYTE (c);
1752           if (c < 0xA0)
1753             goto invalid_code;
1754           code |= c & 0x7F;
1755           break;
1756
1757         case 1:
               /* A byte that stands for itself: ASCII, or else a
                  character of charset_eight_bit.  */
1758           code = c;
1759           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1760                                      ? charset_ascii : charset_eight_bit);
1761           break;
1762
1763         default:
               /* Any other table entry is treated as a fatal internal
                  error.  */
1764           abort ();
1765         }
1766       c = DECODE_CHAR (charset, code);
1767       if (c < 0)
1768         goto invalid_code;
1769     }
1770   *nbytes = src - src_base;
1771   *nchars = consumed_chars;
1772   if (id)
1773     *id = charset->id;
1774   return c;
1775
1776  no_more_source:
1777   return -2;
1778
1779  invalid_code:
1780   return -1;
1781 }
1782
1783
1784 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1785    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1786    else return 0.

        CATEGORY_MASK_EMACS_MULE is always added to DETECT_INFO->checked.
        When 0 is returned it is also added to DETECT_INFO->rejected.
        When 1 is returned it is added to DETECT_INFO->found only if a
        composition longer than four bytes, or a complete sequence
        starting with a byte of 0x80 or above, was seen.  */
1787
1788 static int
1789 detect_coding_emacs_mule (coding, detect_info)
1790      struct coding_system *coding;
1791      struct coding_detection_info *detect_info;
1792 {
1793   const unsigned char *src = coding->source, *src_base;
1794   const unsigned char *src_end = coding->source + coding->src_bytes;
1795   int multibytep = coding->src_multibyte;
1796   int consumed_chars = 0;
1797   int c;
1798   int found = 0;
1799
1800   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1801   /* A coding system of this category is always ASCII compatible.  */
1802   src += coding->head_ascii;
1803
1804   while (1)
1805     {
           /* SRC_BASE marks where the current sequence started; it is
              examined at `no_more_source' to tell whether the source
              ended in the middle of a sequence.  */
1806       src_base = src;
1807       ONE_MORE_BYTE (c);
1808       if (c < 0)
1809         continue;
1810       if (c == 0x80)
1811         {
1812           /* Perhaps the start of composite character.  We simply skip
1813              it because analyzing it is too heavy for detecting.  But,
1814              at least, we check that the composite character
1815              consists of more than 4 bytes.  */
               /* A separate name, so that the outer SRC_BASE is not
                  shadowed.  */
1816           const unsigned char *src_start;
1817
1818         repeat:
1819           src_start = src;
1820           do
1821             {
1822               ONE_MORE_BYTE (c);
1823             }
1824           while (c >= 0xA0);
1825
1826           if (src - src_start <= 4)
1827             break;
1828           found = CATEGORY_MASK_EMACS_MULE;
1829           if (c == 0x80)
1830             goto repeat;
1831         }
1832
1833       if (c < 0x80)
1834         {
               /* ESC, SI and SO are taken as evidence against
                  emacs-mule.  */
1835           if (c < 0x20
1836               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1837             break;
1838         }
1839       else
1840         {
               /* C is the leading byte just read (0x80..0xFF here).  It
                  must be used as the index rather than *SRC_BASE: after
                  a composition was skipped above, SRC_BASE still points
                  at the 0x80 byte that introduced the composition.  */
1841           int more_bytes = emacs_mule_bytes[c] - 1;
1842
1843           while (more_bytes > 0)
1844             {
1845               ONE_MORE_BYTE (c);
1846               if (c < 0xA0)
1847                 {
1848                   src--;        /* Unread the last byte.  */
1849                   break;
1850                 }
1851               more_bytes--;
1852             }
               /* A trailing byte was missing: not emacs-mule.  */
1853           if (more_bytes != 0)
1854             break;
1855           found = CATEGORY_MASK_EMACS_MULE;
1856         }
1857     }
1858   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1859   return 0;
1860
1861  no_more_source:
       /* The source ran out.  If that happened inside a sequence and this
          is the last block, the text is truncated: reject it.  */
1862   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1863     {
1864       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1865       return 0;
1866     }
1867   detect_info->found |= found;
1868   return 1;
1869 }
1870
1871
1872 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1873
1874 /* Decode a character represented as a component of composition
1875    sequence of Emacs 20/21 style at SRC, by calling emacs_mule_char.
1876    Store that character in *BUF, increment BUF, advance SRC to the head
1877    of the next character (or an encoded composition rule), and add the
1878    character count reported by emacs_mule_char to consumed_chars.
1879
        If SRC is already at SRC_END, or emacs_mule_char returns -2, execute
        `break'; that leaves whatever loop (or `do ... while (0)' body)
        directly encloses the macro call.  If emacs_mule_char returns any
        other negative value, jump to the caller's `invalid_code' label.

        The expansion ends in a dangling `else', so the call must be
        followed by a statement (normally just `;').  The macro uses the
        caller's CODING, SRC, SRC_END and consumed_chars.  */
1880
1881 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1882   if (1)                                                        \
1883     {                                                           \
1884       int c;                                                    \
1885       int nbytes, nchars;                                       \
1886                                                                 \
1887       if (src == src_end)                                       \
1888         break;                                                  \
1889       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1890       if (c < 0)                                                \
1891         {                                                       \
1892           if (c == -2)                                          \
1893             break;                                              \
1894           goto invalid_code;                                    \
1895         }                                                       \
1896       *buf++ = c;                                               \
1897       src += nbytes;                                            \
1898       consumed_chars += nchars;                                 \
1899     }                                                           \
1900   else
1901
1902
1903 /* Decode a composition rule represented as a component of composition
1904    sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
1905    and increment BUF.  A rule is one byte of the range 0xA0..0xF0; its
1906    offset from 0xA0 is GREF * 9 + NREF.  If SRC is at SRC_END, or the
        byte is outside that range, jump to the caller's `invalid_code'
        label.  */
1907
1908 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1909   do {                                                  \
1910     int c, gref, nref;                                  \
1911                                                         \
1912     if (src >= src_end)                                 \
1913       goto invalid_code;                                \
1914     ONE_MORE_BYTE_NO_CHECK (c);                         \
         /* Emacs 20 rules are offset by 0xA0, not 0x20.  */ \
1915     c -= 0xA0;                                          \
1916     if (c < 0 || c >= 81)                               \
1917       goto invalid_code;                                \
1918                                                         \
1919     gref = c / 9, nref = c % 9;                         \
1920     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1921   } while (0)
1922
1923
1924 /* Decode a composition rule represented as a component of composition
1925    sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
1926    and increment BUF.  The rule is two bytes, GREF + 0x20 followed by
1927    NREF + 0x20.  If fewer than two bytes remain before SRC_END, or
        either value is outside 0..80, jump to the caller's `invalid_code'
        label.  */
1928
1929 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1930   do {                                                  \
1931     int gref, nref;                                     \
1932                                                         \
1933     if (src + 1>= src_end)                              \
1934       goto invalid_code;                                \
1935     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1936     gref -= 0x20;                                       \
1937     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1938     nref -= 0x20;                                       \
1939     if (gref < 0 || gref >= 81                          \
1940         || nref < 0 || nref >= 81)                      \
1941       goto invalid_code;                                \
1942     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1943   } while (0)
1944
1945
/* Decode the header of an Emacs 21 style composition.  C is the METHOD
   byte, already read by the caller; the composition method is C - 0xF2.
   Read the BYTES byte (length is the byte minus 0xA0 and must be at
   least 3) and the CHARS byte (count is the byte minus 0xA0), then call
   ADD_COMPOSITION_DATA on CHARBUF.

   Unless the method is COMPOSITION_RELATIVE, go on decoding components
   into CHARBUF until consumed_chars reaches consumed_chars_base plus
   the BYTES length: components at odd positions are decoded as rules,
   except that for COMPOSITION_WITH_ALTCHARS every component is decoded
   as a character.  Finally subtract the number of components from
   CHARBUF_BASE[0], the first element ADD_COMPOSITION_DATA stored
   (presumably the negated length of the annotation -- confirm against
   ADD_COMPOSITION_DATA).

   Jump to the caller's `invalid_code' label on malformed input.  Uses
   the caller's CHARBUF, SRC, SRC_END, consumed_chars and
   consumed_chars_base.

   NOTE(review): the limit is a byte length but it is compared with
   consumed_chars; confirm the two agree for multibyte sources.  */
1946 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1947   do {                                                                  \
1948     /* Emacs 21 style format.  The first three bytes at SRC are         \
1949        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1950        the byte length of this composition information, CHARS is the    \
1951        number of characters composed by this composition.  */           \
1952     enum composition_method method = c - 0xF2;                          \
1953     int *charbuf_base = charbuf;                                        \
1954     int consumed_chars_limit;                                           \
1955     int nbytes, nchars;                                                 \
1956                                                                         \
1957     ONE_MORE_BYTE (c);                                                  \
1958     if (c < 0)                                                          \
1959       goto invalid_code;                                                \
1960     nbytes = c - 0xA0;                                                  \
1961     if (nbytes < 3)                                                     \
1962       goto invalid_code;                                                \
1963     ONE_MORE_BYTE (c);                                                  \
1964     if (c < 0)                                                          \
1965       goto invalid_code;                                                \
1966     nchars = c - 0xA0;                                                  \
1967     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
1968     consumed_chars_limit = consumed_chars_base + nbytes;                \
1969     if (method != COMPOSITION_RELATIVE)                                 \
1970       {                                                                 \
1971         int i = 0;                                                      \
1972         while (consumed_chars < consumed_chars_limit)                   \
1973           {                                                             \
1974             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1975               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1976             else                                                        \
1977               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1978             i++;                                                        \
1979           }                                                             \
1980         if (consumed_chars < consumed_chars_limit)                      \
1981           goto invalid_code;                                            \
1982         charbuf_base[0] -= i;                                           \
1983       }                                                                 \
1984   } while (0)
1985
1986
/* Decode an Emacs 20 style relative composition (0x80 MSEQ ...).  The
   caller has already read the 0x80 byte and the byte after it; this
   macro rewinds to SRC_BASE, skips the 0x80 byte again, and decodes up
   to MAX_COMPOSITION_COMPONENTS characters.  With fewer than two
   characters it jumps to the caller's `invalid_code' label; otherwise
   it calls ADD_COMPOSITION_DATA on CHARBUF and appends the characters.
   Uses the caller's CHARBUF, SRC, SRC_BASE, consumed_chars and
   consumed_chars_base.  */
1987 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1988   do {                                                          \
1989     /* Emacs 20 style format for relative composition.  */      \
1990     /* Store multibyte form of characters to be composed.  */   \
1991     enum composition_method method = COMPOSITION_RELATIVE;      \
1992     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1993     int *buf = components;                                      \
1994     int i, j;                                                   \
1995                                                                 \
         /* Rewind the character count together with SRC, so */     \
         /* that the bytes read again are not counted twice.  */    \
1996     src = src_base;                                             \
         consumed_chars = consumed_chars_base;                       \
1997     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1998     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1999       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
2000     if (i < 2)                                                  \
2001       goto invalid_code;                                        \
2002     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
2003     for (j = 0; j < i; j++)                                     \
2004       *charbuf++ = components[j];                               \
2005   } while (0)
2006
2007
/* Decode an Emacs 20 style rule-base composition
   (0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ).  The caller has already
   read the 0x80 and 0xFF bytes.  Characters and rules are collected
   alternately in COMPONENTS; the sequence ends at SRC_END, at a byte
   below 0xA0, or after MAX_COMPOSITION_COMPONENTS characters.

   With fewer than two characters, or a sequence ending in a rule, jump
   to the caller's `invalid_code' label.  If CHARBUF has no room for the
   result, jump to `no_more_source' so the composition is retried.
   Otherwise call ADD_COMPOSITION_DATA on CHARBUF, append the
   interleaved components, and then append the composed characters.

   NOTE(review): the length adjustment through CHARBUF_BASE[0] follows
   what DECODE_EMACS_MULE_21_COMPOSITION does; confirm against
   ADD_COMPOSITION_DATA.  The room test relies on CHARBUF_END already
   keeping MAX_ANNOTATION_LENGTH elements in reserve for the annotation
   header.  */
2008 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
2009   do {                                                          \
2010     /* Emacs 20 style format for rule-base composition.  */     \
2011     /* Store multibyte form of characters to be composed.  */   \
2012     enum composition_method method = COMPOSITION_WITH_RULE;     \
         int *charbuf_base = charbuf;                                \
2013     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
2014     int *buf = components;                                      \
         int ncomponents;                                            \
2015     int i, j;                                                   \
2016                                                                 \
         /* I counts decoded characters, so at most          */     \
         /* 2 * MAX_COMPOSITION_COMPONENTS - 1 elements are  */     \
         /* stored in COMPONENTS.                            */     \
2018     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
2019       {                                                         \
             if (i > 0)                                              \
               {                                                     \
                 if (src >= src_end || *src < 0xA0)                  \
                   break;                                            \
2020             DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);        \
               }                                                     \
2021         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
2022       }                                                         \
         ncomponents = buf - components;                             \
2023     if (i < 2 || ncomponents % 2 == 0)                          \
2024       goto invalid_code;                                        \
2025     if (charbuf + ncomponents + i >= charbuf_end)               \
2026       goto no_more_source;                                      \
2027     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
2028     for (j = 0; j < ncomponents; j++)                           \
2029       *charbuf++ = components[j];                               \
         charbuf_base[0] -= ncomponents;                             \
         /* Characters sit at the even indexes.  */                  \
2030     for (j = 0; j < ncomponents; j += 2)                        \
2031       *charbuf++ = components[j];                               \
2032   } while (0)
2033
2034
/* Decode emacs-mule text from CODING->source, starting after the
   CODING->consumed bytes already handled, into CODING->charbuf after
   its CODING->charbuf_used elements.

   Decoding stops when CHARBUF reaches CHARBUF_END (which keeps
   MAX_ANNOTATION_LENGTH elements in reserve) or when the source runs
   out.  A byte sequence that cannot be decoded is not fatal: its first
   byte is stored as itself (ASCII) or as BYTE8_TO_CHAR of it,
   CODING->errors is incremented, and decoding resumes at the next byte.

   Whenever the charset of the decoded multi-byte characters changes,
   and once more at the end, ADD_CHARSET_DATA is called for the
   preceding run, unless that run's charset is charset_ascii.

   On return CODING->consumed, CODING->consumed_char and
   CODING->charbuf_used describe the input up to the last completely
   decoded sequence.  */
2035 static void
2036 decode_coding_emacs_mule (coding)
2037      struct coding_system *coding;
2038 {
2039   const unsigned char *src = coding->source + coding->consumed;
2040   const unsigned char *src_end = coding->source + coding->src_bytes;
2041   const unsigned char *src_base;
2042   int *charbuf = coding->charbuf + coding->charbuf_used;
2043   int *charbuf_end
2044     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2045   int consumed_chars = 0, consumed_chars_base;
2046   int multibytep = coding->src_multibyte;
2047   Lisp_Object attrs, charset_list;
2048   int char_offset = coding->produced_char;
2049   int last_offset = char_offset;
2050   int last_id = charset_ascii;
2051
2052   CODING_GET_INFO (coding, attrs, charset_list);
2053
2054   while (1)
2055     {
2056       int c;
2057
           /* Remember where this sequence starts so that it can be
              re-read, or left unconsumed, as a whole.  */
2058       src_base = src;
2059       consumed_chars_base = consumed_chars;
2060
2061       if (charbuf >= charbuf_end)
2062         break;
2063
2064       ONE_MORE_BYTE (c);
2065       if (c < 0)
2066         {
2067           *charbuf++ = -c;
2068           char_offset++;
2069         }
2070       else if (c < 0x80)
2071         {
2072           *charbuf++ = c;
2073           char_offset++;
2074         }
2075       else if (c == 0x80)
2076         {
               /* Composition data; the next byte tells which format.  */
2077           ONE_MORE_BYTE (c);
2078           if (c < 0)
2079             goto invalid_code;
2080           if (c - 0xF2 >= COMPOSITION_RELATIVE
2081               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2082             DECODE_EMACS_MULE_21_COMPOSITION (c);
2083           else if (c < 0xC0)
2084             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2085           else if (c == 0xFF)
2086             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2087           else
2088             goto invalid_code;
2089         }
2090       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2091         {
2092           int nbytes, nchars;
2093           int id;
2094
               /* Re-read the whole sequence through emacs_mule_char.  */
2095           src = src_base;
2096           consumed_chars = consumed_chars_base;
2097           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2098           if (c < 0)
2099             {
2100               if (c == -2)
2101                 break;
2102               goto invalid_code;
2103             }
2104           if (last_id != id)
2105             {
2106               if (last_id != charset_ascii)
2107                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2108               last_id = id;
2109               last_offset = char_offset;
2110             }
2111           *charbuf++ = c;
2112           src += nbytes;
2113           consumed_chars += nchars;
2114           char_offset++;
2115         }
           else
             {
               /* C is in 0x81..0xFF and does not start a multi-byte
                  sequence.  Such a byte stands for itself in emacs-mule
                  text; keep it as a raw 8-bit character.  Without this
                  branch the byte was consumed but nothing was stored.  */
               *charbuf++ = BYTE8_TO_CHAR (c);
               char_offset++;
             }
2116       continue;
2117
2118     invalid_code:
           /* Give up on the sequence: emit its first byte alone and
              resume right after it.  */
2119       src = src_base;
2120       consumed_chars = consumed_chars_base;
2121       ONE_MORE_BYTE (c);
2122       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2123       char_offset++;
2124       coding->errors++;
2125     }
2126
2127  no_more_source:
2128   if (last_id != charset_ascii)
2129     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2130   coding->consumed_char += consumed_chars_base;
2131   coding->consumed = src_base - coding->source;
2132   coding->charbuf_used = charbuf - coding->charbuf;
2133 }
2134
2135
/* Store in CODES[0] and CODES[1] the leading codes for the emacs-mule
   charset id ID.  An id below 0xA0 is its own single leading code and
   CODES[1] is set to 0.  Any other id is stored in CODES[1], preceded
   in CODES[0] by 0x9A, 0x9B, 0x9C or 0x9D according to the range ID
   falls in.  ID is evaluated more than once.

   The expansion deliberately does not end in a semicolon, so that
   `EMACS_MULE_LEADING_CODES (id, codes);' is exactly one statement.  */
2136 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2137   do {                                          \
2138     if ((id) < 0xA0)                            \
2139       (codes)[0] = (id), (codes)[1] = 0;        \
2140     else if ((id) < 0xE0)                       \
2141       (codes)[0] = 0x9A, (codes)[1] = (id);     \
2142     else if ((id) < 0xF0)                       \
2143       (codes)[0] = 0x9B, (codes)[1] = (id);     \
2144     else if ((id) < 0xF5)                       \
2145       (codes)[0] = 0x9C, (codes)[1] = (id);     \
2146     else                                        \
2147       (codes)[0] = 0x9D, (codes)[1] = (id);     \
2148   } while (0)
2149
2150
/* Encode the characters of CODING->charbuf in emacs-mule form and
   store the bytes at CODING->destination, after the CODING->produced
   bytes that are already there.

   The charset list of CODING is forced to Vemacs_mule_charset_list.
   A negative element of charbuf starts an annotation: a charset
   annotation selects a preferred charset for the following characters
   (if it is a member of the charset list), and a composition
   annotation is skipped because encoding compositions is not yet
   implemented.

   ASCII characters and raw 8-bit characters are written as single
   bytes.  Any other character is written as the leading code(s) of its
   charset followed by one or two position bytes with the high bit set.
   A character that is in no charset of the list is replaced by
   CODING->default_char.

   On normal completion record CODING_RESULT_SUCCESS, update
   CODING->produced and CODING->produced_char, and return 0.  */
2151 static int
2152 encode_coding_emacs_mule (coding)
2153      struct coding_system *coding;
2154 {
2155   int multibytep = coding->dst_multibyte;
2156   int *charbuf = coding->charbuf;
2157   int *charbuf_end = charbuf + coding->charbuf_used;
2158   unsigned char *dst = coding->destination + coding->produced;
2159   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2160   int safe_room = 8;
2161   int produced_chars = 0;
2162   Lisp_Object attrs, charset_list;
2163   int c;
2164   int preferred_charset_id = -1;
2165
2166   CODING_GET_INFO (coding, attrs, charset_list);
2167   if (! EQ (charset_list, Vemacs_mule_charset_list))
2168     {
2169       CODING_ATTR_CHARSET_LIST (attrs)
2170         = charset_list = Vemacs_mule_charset_list;
2171     }
2172
2173   while (charbuf < charbuf_end)
2174     {
2175       ASSURE_DESTINATION (safe_room);
2176       c = *charbuf++;
2177
2178       if (c < 0)
2179         {
2180           /* Handle an annotation.  */
2181           switch (*charbuf)
2182             {
2183             case CODING_ANNOTATE_COMPOSITION_MASK:
2184               /* Not yet implemented.  */
2185               break;
2186             case CODING_ANNOTATE_CHARSET_MASK:
2187               preferred_charset_id = charbuf[3];
2188               if (preferred_charset_id >= 0
2189                   && NILP (Fmemq (make_number (preferred_charset_id),
2190                                   charset_list)))
2191                 preferred_charset_id = -1;
2192               break;
2193             default:
2194               abort ();
2195             }
               /* -C is the length of the annotation, including the
                  element just read.  */
2196           charbuf += -c - 1;
2197           continue;
2198         }
2199
2200       if (ASCII_CHAR_P (c))
2201         EMIT_ONE_ASCII_BYTE (c);
2202       else if (CHAR_BYTE8_P (c))
2203         {
2204           c = CHAR_TO_BYTE8 (c);
2205           EMIT_ONE_BYTE (c);
2206         }
2207       else
2208         {
2209           struct charset *charset;
2210           unsigned code;
2211           int dimension;
2212           int emacs_mule_id;
2213           unsigned char leading_codes[2];
2214
               /* Every path below must set CODE together with CHARSET:
                  CODE is what gets written after the leading codes.  */
2215           if (preferred_charset_id >= 0)
2216             {
2217               charset = CHARSET_FROM_ID (preferred_charset_id);
2218               if (CHAR_CHARSET_P (c, charset))
                     code = ENCODE_CHAR (charset, c);
                   else
2219                 charset = char_charset (c, charset_list, &code);
2220             }
2221           else
2222             charset = char_charset (c, charset_list, &code);
2223           if (! charset)
2224             {
2225               c = coding->default_char;
2226               if (ASCII_CHAR_P (c))
2227                 {
2228                   EMIT_ONE_ASCII_BYTE (c);
2229                   continue;
2230                 }
2231               charset = char_charset (c, charset_list, &code);
2232             }
2233           dimension = CHARSET_DIMENSION (charset);
2234           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2235           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2236           EMIT_ONE_BYTE (leading_codes[0]);
2237           if (leading_codes[1])
2238             EMIT_ONE_BYTE (leading_codes[1]);
2239           if (dimension == 1)
2240             EMIT_ONE_BYTE (code | 0x80);
2241           else
2242             {
2243               code |= 0x8080;
2244               EMIT_ONE_BYTE (code >> 8);
2245               EMIT_ONE_BYTE (code & 0xFF);
2246             }
2247         }
2248     }
2249   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2250   coding->produced_char += produced_chars;
2251   coding->produced = dst - coding->destination;
2252   return 0;
2253 }
2254
2255 \f
2256 /*** 7. ISO2022 handlers ***/
2257
2258 /* The following note describes the coding system ISO2022 briefly.
2259    Since the intention of this note is to help understand the
2260    functions in this file, some parts are NOT ACCURATE or are OVERLY
2261    SIMPLIFIED.  For thorough understanding, please refer to the
2262    original document of ISO2022.  This is equivalent to the standard
2263    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2264
2265    ISO2022 provides many mechanisms to encode several character sets
2266    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2267    is encoded using bytes less than 128.  This may make the encoded
2268    text a little bit longer, but the text passes more easily through
2269    several types of gateway, some of which strip off the MSB (Most
2270    Significant Bit).
2271
2272    There are two kinds of character sets: control character sets and
2273    graphic character sets.  The former contain control characters such
2274    as `newline' and `escape' to provide control functions (control
2275    functions are also provided by escape sequences).  The latter
2276    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2277    two control character sets and many graphic character sets.
2278
2279    Graphic character sets are classified into one of the following
2280    four classes, according to the number of bytes (DIMENSION) and
2281    number of characters in one dimension (CHARS) of the set:
2282    - DIMENSION1_CHARS94
2283    - DIMENSION1_CHARS96
2284    - DIMENSION2_CHARS94
2285    - DIMENSION2_CHARS96
2286
2287    In addition, each character set is assigned an identification tag,
2288    unique for each set, called the "final character" (denoted as <F>
2289    hereafter).  The <F> of each character set is decided by ECMA(*)
2290    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2291    (0x30..0x3F are for private use only).
2292
2293    Note (*): ECMA = European Computer Manufacturers Association
2294
2295    Here are examples of graphic character sets [NAME(<F>)]:
2296         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2297         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2298         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2299         o DIMENSION2_CHARS96 -- none for the moment
2300
2301    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2302         C0 [0x00..0x1F] -- control character plane 0
2303         GL [0x20..0x7F] -- graphic character plane 0
2304         C1 [0x80..0x9F] -- control character plane 1
2305         GR [0xA0..0xFF] -- graphic character plane 1
2306
2307    A control character set is directly designated and invoked to C0 or
2308    C1 by an escape sequence.  The most common case is that:
2309    - ISO646's  control character set is designated/invoked to C0, and
2310    - ISO6429's control character set is designated/invoked to C1,
2311    and usually these designations/invocations are omitted in encoded
2312    text.  In a 7-bit environment, only C0 can be used, and a control
2313    character for C1 is encoded by an appropriate escape sequence to
2314    fit into the environment.  All control characters for C1 are
2315    defined to have corresponding escape sequences.
2316
2317    A graphic character set is at first designated to one of four
2318    graphic registers (G0 through G3), then these graphic registers are
2319    invoked to GL or GR.  These designations and invocations can be
2320    done independently.  The most common case is that G0 is invoked to
2321    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2322    these invocations and designations are omitted in encoded text.
2323    In a 7-bit environment, only GL can be used.
2324
2325    When a graphic character set of CHARS94 is invoked to GL, codes
2326    0x20 and 0x7F of the GL area work as control characters SPACE and
2327    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2328    be used.
2329
2330    There are two ways of invocation: locking-shift and single-shift.
2331    With locking-shift, the invocation lasts until the next different
2332    invocation, whereas with single-shift, the invocation affects the
2333    following character only and doesn't affect the locking-shift
2334    state.  Invocations are done by the following control characters or
2335    escape sequences:
2336
2337    ----------------------------------------------------------------------
2338    abbrev  function                  cntrl escape seq   description
2339    ----------------------------------------------------------------------
2340    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2341    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2342    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2343    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2344    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2345    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2346    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2347    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2348    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2349    ----------------------------------------------------------------------
2350    (*) These are not used by any known coding system.
2351
2352    Control characters for these functions are defined by macros
2353    ISO_CODE_XXX in `coding.h'.
2354
2355    Designations are done by the following escape sequences:
2356    ----------------------------------------------------------------------
2357    escape sequence      description
2358    ----------------------------------------------------------------------
2359    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2360    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2361    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2362    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2363    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2364    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2365    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2366    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2367    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2368    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2369    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2370    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2371    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2372    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2373    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2374    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2375    ----------------------------------------------------------------------
2376
2377    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2378    of dimension 1, chars 94, and final character <F>, etc...
2379
2380    Note (*): Although these designations are not allowed in ISO2022,
2381    Emacs accepts them on decoding, and produces them on encoding
2382    CHARS96 character sets in a coding system which is characterized as
2383    7-bit environment, non-locking-shift, and non-single-shift.
2384
2385    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2386    '(' must be omitted.  We refer to this as "short-form" hereafter.
2387
2388    Now you may notice that there are a lot of ways of encoding the
2389    same multilingual text in ISO2022.  Actually, there exist many
2390    coding systems such as Compound Text (used in X11's inter client
2391    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2392    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2393    localized platforms), and all of these are variants of ISO2022.
2394
2395    In addition to the above, Emacs handles two more kinds of escape
2396    sequences: ISO6429's direction specification and Emacs' private
2397    sequence for specifying character composition.
2398
2399    ISO6429's direction specification takes the following form:
2400         o CSI ']'      -- end of the current direction
2401         o CSI '0' ']'  -- end of the current direction
2402         o CSI '1' ']'  -- start of left-to-right text
2403         o CSI '2' ']'  -- start of right-to-left text
2404    The control character CSI (0x9B: control sequence introducer) is
2405    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2406
2407    Character composition specification takes the following form:
2408         o ESC '0' -- start relative composition
2409         o ESC '1' -- end composition
2410         o ESC '2' -- start rule-base composition (*)
2411         o ESC '3' -- start relative composition with alternate chars  (**)
2412         o ESC '4' -- start rule-base composition with alternate chars  (**)
2413   Since these are not standard escape sequences of any ISO standard,
2414   the use of them with these meanings is restricted to Emacs only.
2415
2416   (*) This form is used only in Emacs 20.7 and older versions,
2417   but newer versions can safely decode it.
2418   (**) This form is used only in Emacs 21.1 and newer versions,
2419   and older versions can't decode it.
2420
2421   Here's a list of example usages of these composition escape
2422   sequences (categorized by `enum composition_method').
2423
2424   COMPOSITION_RELATIVE:
2425         ESC 0 CHAR [ CHAR ] ESC 1
2426   COMPOSITION_WITH_RULE:
2427         ESC 2 CHAR [ RULE CHAR ] ESC 1
2428   COMPOSITION_WITH_ALTCHARS:
2429         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2430   COMPOSITION_WITH_RULE_ALTCHARS:
2431         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2432
     /* Class of each byte value, indexed by the byte itself (256 entries).
        decode_coding_iso_2022 switches on iso_code_class[c1] to decide how
        to handle the source byte c1.  */
2433 enum iso_code_class_type iso_code_class[256];
2434
/* Non-zero if charset ID is safe for CODING, i.e. ID is a valid index
   into CODING->safe_charsets and the entry there is a graphic register
   number rather than the "not safe" filler.

   setup_iso_safe_charsets fills the table with 255 and overwrites the
   entries of the listed charsets with small register numbers, and the
   table is accessed through a plain `char *'.  Testing the high bit
   through `unsigned char' gives the same answer whether plain `char'
   is signed or unsigned (with an unsigned `char', a `>= 0' test is
   always true).  A negative ID is rejected instead of indexing before
   the start of the table.

   ID is evaluated more than once, so it must have no side effects.  */
#define SAFE_CHARSET_P(coding, id)                              \
  ((id) >= 0 && (id) <= (coding)->max_charset_id                \
   && ((unsigned char) (coding)->safe_charsets[id]) < 0x80)
2438
2439
     /* Non-zero if CODING_ISO_INITIAL (..., 1) is non-negative for the
        coding system stored in coding_categories[CATEGORY].
        NOTE(review): presumably that value is the charset initially
        designated to graphic register G1, which would make this a test of
        whether shift-out (invoking G1 into GL) is usable -- confirm
        against the definition of CODING_ISO_INITIAL.  */
2440 #define SHIFT_OUT_OK(category)  \
2441   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2442
     /* Make sure the coding attribute vector ATTRS has a safe-charsets
        string in its coding_attr_safe_charsets slot, building it if needed.

        The string has one element per charset ID from 0 up to the largest
        ID found in the charset list of ATTRS.  Every element starts as 255;
        then, for each charset in the list, its element is set to the
        register associated with it in the coding_attr_iso_request alist,
        or else to the default register for 96-char or 94-char sets (the
        car and cdr of coding_attr_iso_usage are read as REG94 and REG96),
        provided that default is less than 4.

        If ATTRS has CODING_ISO_FLAG_FULL_SUPPORT but its charset list is
        not Viso_2022_charset_list, the list is replaced by that variable
        and any previously built string is discarded first.  If the slot
        already holds a string after that step, nothing is done.  */
2443 static void
2444 setup_iso_safe_charsets (attrs)
2445      Lisp_Object attrs;
2446 {
2447   Lisp_Object charset_list, safe_charsets;
2448   Lisp_Object request;
2449   Lisp_Object reg_usage;
2450   Lisp_Object tail;
2451   int reg94, reg96;
2452   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2453   int max_charset_id;
2454
2455   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2456   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2457       && ! EQ (charset_list, Viso_2022_charset_list))
2458     {
         /* The charset list is out of date: adopt the current one and
            drop the cached string so that it is rebuilt below.  */
2459       CODING_ATTR_CHARSET_LIST (attrs)
2460         = charset_list = Viso_2022_charset_list;
2461       ASET (attrs, coding_attr_safe_charsets, Qnil);
2462     }
2463
       /* A string is already cached; keep it.  */
2464   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2465     return;
2466
       /* First pass: find the largest charset ID to size the string.  */
2467   max_charset_id = 0;
2468   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2469     {
2470       int id = XINT (XCAR (tail));
2471       if (max_charset_id < id)
2472         max_charset_id = id;
2473     }
2474
       /* Every element starts as 255; only charsets handled by the loop
          below get a register number instead.  */
2475   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2476                                 make_number (255));
2477   request = AREF (attrs, coding_attr_iso_request);
2478   reg_usage = AREF (attrs, coding_attr_iso_usage);
2479   reg94 = XINT (XCAR (reg_usage));
2480   reg96 = XINT (XCDR (reg_usage));
2481
       /* Second pass: record a register for each charset in the list.  */
2482   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2483     {
2484       Lisp_Object id;
2485       Lisp_Object reg;
2486       struct charset *charset;
2487
2488       id = XCAR (tail);
2489       charset = CHARSET_FROM_ID (XINT (id));
2490       reg = Fcdr (Fassq (id, request));
2491       if (! NILP (reg))
           /* An explicit request for this charset takes precedence.  */
2492         SSET (safe_charsets, XINT (id), XINT (reg));
2493       else if (charset->iso_chars_96)
2494         {
2495           if (reg96 < 4)
2496             SSET (safe_charsets, XINT (id), reg96);
2497         }
2498       else
2499         {
2500           if (reg94 < 4)
2501             SSET (safe_charsets, XINT (id), reg94);
2502         }
2503     }
2504   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2505 }
2506
2507
2508 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2509    Check if a text is encoded in one of ISO-2022 based coding systems.
2510    If it is, return 1, else return 0.

        The bytes of CODING->source are scanned, starting after the first
        CODING->head_ascii bytes.  While scanning, category masks are
        accumulated in the locals `found' and `rejected'.  If every
        category of CATEGORY_MASK_ISO gets rejected, CATEGORY_MASK_ISO is
        added to DETECT_INFO->rejected and 0 is returned.  Otherwise, at
        the label `no_more_source', `rejected' and the part of `found'
        that was not rejected are merged into DETECT_INFO and 1 is
        returned.  CATEGORY_MASK_ISO is always added to
        DETECT_INFO->checked.  */
2511
2512 static int
2513 detect_coding_iso_2022 (coding, detect_info)
2514      struct coding_system *coding;
2515      struct coding_detection_info *detect_info;
2516 {
2517   const unsigned char *src = coding->source, *src_base = src;
2518   const unsigned char *src_end = coding->source + coding->src_bytes;
2519   int multibytep = coding->src_multibyte;
2520   int single_shifting = 0;
2521   int id;
2522   int c, c1;
2523   int consumed_chars = 0;
2524   int i;
2525   int rejected = 0;
2526   int found = 0;
2527
2528   detect_info->checked |= CATEGORY_MASK_ISO;
2529
       /* Point each ISO category's coding system at its safe-charsets
          string so that SAFE_CHARSET_P can be used on it below.  */
2530   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2531     {
2532       struct coding_system *this = &(coding_categories[i]);
2533       Lisp_Object attrs, val;
2534
2535       attrs = CODING_ID_ATTRS (this->id);
2536       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2537           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2538         setup_iso_safe_charsets (attrs);
2539       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2540       this->max_charset_id = SCHARS (val) - 1;
2541       this->safe_charsets = (char *) SDATA (val);
2542     }
2543
2544   /* A coding system of this category is always ASCII compatible.  */
2545   src += coding->head_ascii;
2546
       /* Scan until every ISO category has been rejected.
          NOTE(review): no statement visible here leaves the loop for
          `no_more_source'; presumably ONE_MORE_BYTE jumps there when the
          source is exhausted -- confirm against its definition.  */
2547   while (rejected != CATEGORY_MASK_ISO)
2548     {
2549       src_base = src;
2550       ONE_MORE_BYTE (c);
2551       switch (c)
2552         {
2553         case ISO_CODE_ESC:
2554           if (inhibit_iso_escape_detection)
2555             break;
2556           single_shifting = 0;
2557           ONE_MORE_BYTE (c);
2558           if (c >= '(' && c <= '/')
2559             {
2560               /* Designation sequence for a charset of dimension 1.  */
2561               ONE_MORE_BYTE (c1);
2562               if (c1 < ' ' || c1 >= 0x80
2563                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2564                 /* Invalid designation sequence.  Just ignore.  */
2565                 break;
2566             }
2567           else if (c == '$')
2568             {
2569               /* Designation sequence for a charset of dimension 2.  */
2570               ONE_MORE_BYTE (c);
2571               if (c >= '@' && c <= 'B')
2572                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2573                 id = iso_charset_table[1][0][c];
2574               else if (c >= '(' && c <= '/')
2575                 {
2576                   ONE_MORE_BYTE (c1);
2577                   if (c1 < ' ' || c1 >= 0x80
2578                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2579                     /* Invalid designation sequence.  Just ignore.  */
2580                     break;
2581                 }
2582               else
2583                 /* Invalid designation sequence.  Just ignore it.  */
2584                 break;
2585             }
2586           else if (c == 'N' || c == 'O')
2587             {
2588               /* ESC <Fe> for SS2 or SS3.  */
2589               single_shifting = 1;
2590               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2591               break;
2592             }
2593           else if (c >= '0' && c <= '4')
2594             {
2595               /* ESC <Fp> for start/end composition.  */
2596               found |= CATEGORY_MASK_ISO;
2597               break;
2598             }
2599           else
2600             {
2601               /* Invalid escape sequence.  Just ignore it.  */
2602               break;
2603             }
2604
2605           /* We found a valid designation sequence for CHARSET.  */
               /* The categories of CATEGORY_MASK_ISO_8BIT are rejected
                  outright; each of the four categories tested below is
                  marked found if the designated charset is safe for it
                  and rejected otherwise.  */
2606           rejected |= CATEGORY_MASK_ISO_8BIT;
2607           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2608                               id))
2609             found |= CATEGORY_MASK_ISO_7;
2610           else
2611             rejected |= CATEGORY_MASK_ISO_7;
2612           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2613                               id))
2614             found |= CATEGORY_MASK_ISO_7_TIGHT;
2615           else
2616             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2617           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2618                               id))
2619             found |= CATEGORY_MASK_ISO_7_ELSE;
2620           else
2621             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2622           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2623                               id))
2624             found |= CATEGORY_MASK_ISO_8_ELSE;
2625           else
2626             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2627           break;
2628
2629         case ISO_CODE_SO:
2630         case ISO_CODE_SI:
2631           /* Locking shift out/in.  */
2632           if (inhibit_iso_escape_detection)
2633             break;
2634           single_shifting = 0;
2635           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2636           found |= CATEGORY_MASK_ISO_ELSE;
2637           break;
2638
2639         case ISO_CODE_CSI:
2640           /* Control sequence introducer.  */
2641           single_shifting = 0;
2642           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2643           found |= CATEGORY_MASK_ISO_8_ELSE;
2644           goto check_extra_latin;
2645
2646         case ISO_CODE_SS2:
2647         case ISO_CODE_SS3:
2648           /* Single shift.   */
2649           if (inhibit_iso_escape_detection)
2650             break;
2651           single_shifting = 0;
2652           rejected |= CATEGORY_MASK_ISO_7BIT;
2653           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2654               & CODING_ISO_FLAG_SINGLE_SHIFT)
2655             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2656           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2657               & CODING_ISO_FLAG_SINGLE_SHIFT)
2658             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2659           if (single_shifting)
2660             break;
               /* Neither 8-bit category uses single shifts, so treat the
                  byte like any other code of the 0x80..0x9F area.  */
2661           goto check_extra_latin;
2662
2663         default:
               /* NOTE(review): a negative C presumably means ONE_MORE_BYTE
                  read something that is not a raw byte (multibyte source);
                  such input is skipped -- confirm.  */
2664           if (c < 0)
2665             continue;
2666           if (c < 0x80)
2667             {
2668               single_shifting = 0;
2669               break;
2670             }
2671           if (c >= 0xA0)
2672             {
2673               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2674               found |= CATEGORY_MASK_ISO_8_1;
2675               /* Check the length of succeeding codes of the range
2676                  0xA0..0xFF.  If the byte length is even, we include
2677                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2678                  only when we are not single shifting.  */
2679               if (! single_shifting
2680                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2681                 {
                       /* NOTE: this `i' shadows the function-level `i'.
                          It counts the bytes of the current run, starting
                          with the one already read.  */
2682                   int i = 1;
2683                   while (src < src_end)
2684                     {
2685                       ONE_MORE_BYTE (c);
2686                       if (c < 0xA0)
2687                         break;
2688                       i++;
2689                     }
2690
                       /* An odd-length run is accepted only when it
                          reaches the end of the source.  */
2691                   if (i & 1 && src < src_end)
2692                     rejected |= CATEGORY_MASK_ISO_8_2;
2693                   else
2694                     found |= CATEGORY_MASK_ISO_8_2;
2695                 }
2696               break;
2697             }
               /* Reached for a byte in 0x80..0x9F, either by falling
                  through here or by the gotos in the CSI and SS2/SS3
                  cases.  Such a byte is acceptable only if
                  Vlatin_extra_code_table is a vector with a non-nil entry
                  for it; otherwise all ISO categories are rejected.  */
2698         check_extra_latin:
2699           single_shifting = 0;
2700           if (! VECTORP (Vlatin_extra_code_table)
2701               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2702             {
2703               rejected = CATEGORY_MASK_ISO;
2704               break;
2705             }
2706           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2707               & CODING_ISO_FLAG_LATIN_EXTRA)
2708             found |= CATEGORY_MASK_ISO_8_1;
2709           else
2710             rejected |= CATEGORY_MASK_ISO_8_1;
2711           rejected |= CATEGORY_MASK_ISO_8_2;
2712         }
2713     }
       /* Every ISO category has been rejected.  */
2714   detect_info->rejected |= CATEGORY_MASK_ISO;
2715   return 0;
2716
2717  no_more_source:
       /* Report what was learned; a category that was both found and
          rejected counts as rejected only.  */
2718   detect_info->rejected |= rejected;
2719   detect_info->found |= (found & ~rejected);
2720   return 1;
2721 }
2722
2723
2724 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
2725    escape sequence should be kept.

        REG is the graphic register being designated; DIM, CHARS_96 and
        FINAL select the charset through ISO_CHARSET_TABLE.  CHARS_96 must
        be a modifiable lvalue, and a variable named `coding' must be in
        scope at the point of use.

        If FINAL is outside '0'..127, if the table has no charset for the
        combination, or if the charset is not SAFE_CHARSET_P for CODING,
        the designation of REG is recorded as -2 (invalid) and CHARS_96 is
        set to -1.  Otherwise the charset ID is recorded as the
        designation of REG, after replacing charset_jisx0201_roman by
        charset_ascii when CODING_ISO_FLAG_USE_ROMAN is set and
        charset_jisx0208_1978 by charset_jisx0208 when
        CODING_ISO_FLAG_USE_OLDJIS is set.  CHARS_96 is also set to -1
        when ASCII is designated to a register whose previous designation
        was the invalid marker -2.  */
2726 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2727   do {                                                                  \
2728     int id, prev;                                                       \
2729                                                                         \
2730     if (final < '0' || final >= 128                                     \
2731         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2732         || !SAFE_CHARSET_P (coding, id))                                \
2733       {                                                                 \
2734         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2735         chars_96 = -1;                                                  \
2736         break;                                                          \
2737       }                                                                 \
2738     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2739     if (id == charset_jisx0201_roman)                                   \
2740       {                                                                 \
2741         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2742           id = charset_ascii;                                           \
2743       }                                                                 \
2744     else if (id == charset_jisx0208_1978)                               \
2745       {                                                                 \
2746         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2747           id = charset_jisx0208;                                        \
2748       }                                                                 \
2749     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2750     /* If there was an invalid designation to REG previously, and this  \
2751        designation is ASCII to REG, we should keep this designation     \
2752        sequence.  */                                                    \
2753     if (prev == -2 && id == charset_ascii)                              \
2754       chars_96 = -1;                                                    \
2755   } while (0)
2756
2757
/* If a composition is being decoded, give it up: flush the components
   collected so far in `components' to CHARBUF as ordinary characters,
   advance CHAR_OFFSET by the number of characters flushed, and reset
   COMPOSITION_STATE to COMPOSING_NO.  Do nothing when no composition is
   in progress.

   For COMPOSITION_RELATIVE and COMPOSITION_WITH_ALTCHARS every stored
   component is flushed.  For the other methods only the even-numbered
   entries are flushed (the odd-numbered ones, which presumably hold
   composition rules, are skipped), so CHAR_OFFSET advances by
   (COMPONENT_IDX + 1) / 2, which is exactly the number of entries
   written.

   This macro uses the variables composition_state, method, components,
   component_idx, charbuf, charbuf_end and char_offset of the calling
   function, and jumps to its label `no_more_source' when CHARBUF has no
   room for COMPONENT_IDX more elements.  */
#define MAYBE_FINISH_COMPOSITION()                              \
  do {                                                          \
    int i;                                                      \
    if (composition_state == COMPOSING_NO)                      \
      break;                                                    \
    /* It is assured that we have enough room for producing     \
       characters stored in the table `components'.  */         \
    if (charbuf + component_idx > charbuf_end)                  \
      goto no_more_source;                                      \
    composition_state = COMPOSING_NO;                           \
    if (method == COMPOSITION_RELATIVE                          \
        || method == COMPOSITION_WITH_ALTCHARS)                 \
      {                                                         \
        for (i = 0; i < component_idx; i++)                     \
          *charbuf++ = components[i];                           \
        char_offset += component_idx;                           \
      }                                                         \
    else                                                        \
      {                                                         \
        for (i = 0; i < component_idx; i += 2)                  \
          *charbuf++ = components[i];                           \
        /* The loop above stores (component_idx + 1) / 2        \
           elements; keep CHAR_OFFSET in step with that.  */    \
        char_offset += (component_idx + 1) / 2;                 \
      }                                                         \
  } while (0)
2782
2783
2784 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2785    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2786    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2787    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2788    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

        C1 is the character following ESC.

        If C1 is '0' while the state is COMPOSING_COMPONENT_RULE, the
        number of components collected so far is saved in COMPONENT_LEN
        and the state becomes COMPOSING_CHAR.

        Otherwise a new composition is started: any pending one is
        finished with MAYBE_FINISH_COMPOSITION, CHARBUF must have room for
        MAX_COMPOSITION_COMPONENTS more elements (else control jumps to
        `no_more_source'), and the rest of the source is searched for the
        terminating ESC 1.  If that is missing, control jumps to
        `invalid_code' when CODING_MODE_LAST_BLOCK is set and to
        `no_more_source' otherwise.  Then METHOD is set from C1, the state
        becomes COMPOSING_CHAR for '0' and '2' and
        COMPOSING_COMPONENT_CHAR for '3' and '4', and COMPONENT_IDX and
        COMPONENT_LEN are reset to 0.

        NOTE(review): when SRC already equals SRC_END the search loop does
        not run and `p == src_end - 1' is false, so the sequence is
        treated as if ESC 1 had been found -- confirm this is intended.

        This macro uses the composition-related variables and the labels
        `no_more_source' and `invalid_code' of the calling function.
2789   */
2790
2791 #define DECODE_COMPOSITION_START(c1)                                    \
2792   do {                                                                  \
2793     if (c1 == '0'                                                       \
2794         && composition_state == COMPOSING_COMPONENT_RULE)               \
2795       {                                                                 \
2796         component_len = component_idx;                                  \
2797         composition_state = COMPOSING_CHAR;                             \
2798       }                                                                 \
2799     else                                                                \
2800       {                                                                 \
2801         const unsigned char *p;                                         \
2802                                                                         \
2803         MAYBE_FINISH_COMPOSITION ();                                    \
2804         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2805           goto no_more_source;                                          \
2806         for (p = src; p < src_end - 1; p++)                             \
2807           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2808             break;                                                      \
2809         if (p == src_end - 1)                                           \
2810           {                                                             \
2811             if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
2812               goto invalid_code;                                        \
2813             goto no_more_source;                                        \
2814           }                                                             \
2815                                                                         \
2816         /* This is surely the start of a composition.  */               \
2817         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2818                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2819                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2820                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2821         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2822                              : COMPOSING_COMPONENT_CHAR);               \
2823         component_idx = component_len = 0;                              \
2824       }                                                                 \
2825   } while (0)
2826
2827
2828 /* Handle composition end sequence ESC 1.

        The number of composed characters is COMPONENT_IDX - COMPONENT_LEN
        when COMPONENT_LEN is positive; otherwise it is COMPONENT_IDX for
        COMPOSITION_RELATIVE and (COMPONENT_IDX + 1) / 2 for the other
        methods.  ADD_COMPOSITION_DATA is invoked with that count and
        METHOD.  For methods other than COMPOSITION_RELATIVE the
        components (all COMPONENT_IDX entries when COMPONENT_LEN is 0,
        else the first COMPONENT_LEN entries) are appended to CHARBUF and
        the element at the position CHARBUF had on entry is overwritten
        with the negative distance from there to the new CHARBUF.
        NOTE(review): presumably that element is the length field of the
        annotation written by ADD_COMPOSITION_DATA -- confirm.

        Then the characters themselves are appended, advancing CHAR_OFFSET
        by one per character: the even-numbered entries of `components'
        for COMPOSITION_WITH_RULE, the entries from COMPONENT_LEN up to
        COMPONENT_IDX otherwise.  Finally CODING->annotated is set to 1
        and the state becomes COMPOSING_NO.

        This macro uses the composition-related variables of the calling
        function.  */
2829
2830 #define DECODE_COMPOSITION_END()                                        \
2831   do {                                                                  \
2832     int nchars = (component_len > 0 ? component_idx - component_len     \
2833                   : method == COMPOSITION_RELATIVE ? component_idx      \
2834                   : (component_idx + 1) / 2);                           \
2835     int i;                                                              \
2836     int *saved_charbuf = charbuf;                                       \
2837                                                                         \
2838     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
2839     if (method != COMPOSITION_RELATIVE)                                 \
2840       {                                                                 \
2841         if (component_len == 0)                                         \
2842           for (i = 0; i < component_idx; i++)                           \
2843             *charbuf++ = components[i];                                 \
2844         else                                                            \
2845           for (i = 0; i < component_len; i++)                           \
2846             *charbuf++ = components[i];                                 \
2847         *saved_charbuf = saved_charbuf - charbuf;                       \
2848       }                                                                 \
2849     if (method == COMPOSITION_WITH_RULE)                                \
2850       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2851         *charbuf++ = components[i];                                     \
2852     else                                                                \
2853       for (i = component_len; i < component_idx; i++, char_offset++)    \
2854         *charbuf++ = components[i];                                     \
2855     coding->annotated = 1;                                              \
2856     composition_state = COMPOSING_NO;                                   \
2857   } while (0)
2858
2859
2860 /* Decode a composition rule from the byte C1 (and maybe one more byte
2861    from SRC) and replace C1 with the encoded composition rule, or with
2862    0 if the byte does not start a valid rule.

        C1 must be a modifiable lvalue; the macro itself stores nothing
        in CODING.  After 32 is subtracted from C1:
          - a value below 81 is the old one-byte format: it is split into
            a global and a new reference point as C1 / 9 and C1 % 9, and a
            reference point of 4 is replaced by 10, before encoding with
            COMPOSITION_ENCODE_RULE;
          - a value from 81 to 92 is the new two-byte format: one more
            byte is read into C2 with ONE_MORE_BYTE and the rule is
            encoded from C1 - 81 and C2 - 32.
        A variable named `c2' and whatever ONE_MORE_BYTE requires must be
        in scope at the point of use.  */
2863
2864 #define DECODE_COMPOSITION_RULE(c1)                                     \
2865   do {                                                                  \
2866     (c1) -= 32;                                                         \
2867     if (c1 < 81)                /* old format (before ver.21) */        \
2868       {                                                                 \
2869         int gref = (c1) / 9;                                            \
2870         int nref = (c1) % 9;                                            \
2871         if (gref == 4) gref = 10;                                       \
2872         if (nref == 4) nref = 10;                                       \
2873         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2874       }                                                                 \
2875     else if (c1 < 93)           /* new format (after ver.21) */         \
2876       {                                                                 \
2877         ONE_MORE_BYTE (c2);                                             \
2878         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2879       }                                                                 \
2880     else                                                                \
2881       c1 = 0;                                                           \
2882   } while (0)
2883
2884
2885 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2886
2887 static void
2888 decode_coding_iso_2022 (coding)
2889      struct coding_system *coding;
2890 {
2891   const unsigned char *src = coding->source + coding->consumed;
2892   const unsigned char *src_end = coding->source + coding->src_bytes;
2893   const unsigned char *src_base;
2894   int *charbuf = coding->charbuf + coding->charbuf_used;
2895   int *charbuf_end
2896     = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2897   int consumed_chars = 0, consumed_chars_base;
2898   int multibytep = coding->src_multibyte;
2899   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2900   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2901   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2902   int charset_id_2, charset_id_3;
2903   struct charset *charset;
2904   int c;
2905   /* For handling composition sequence.  */
2906 #define COMPOSING_NO                    0
2907 #define COMPOSING_CHAR                  1
2908 #define COMPOSING_RULE                  2
2909 #define COMPOSING_COMPONENT_CHAR        3
2910 #define COMPOSING_COMPONENT_RULE        4
2911
2912   int composition_state = COMPOSING_NO;
2913   enum composition_method method;
2914   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2915   int component_idx;
2916   int component_len;
2917   Lisp_Object attrs, charset_list;
2918   int char_offset = coding->produced_char;
2919   int last_offset = char_offset;
2920   int last_id = charset_ascii;
2921
2922   CODING_GET_INFO (coding, attrs, charset_list);
2923   setup_iso_safe_charsets (attrs);
2924
2925   while (1)
2926     {
2927       int c1, c2;
2928
2929       src_base = src;
2930       consumed_chars_base = consumed_chars;
2931
2932       if (charbuf >= charbuf_end)
2933         break;
2934
2935       ONE_MORE_BYTE (c1);
2936       if (c1 < 0)
2937         goto invalid_code;
2938
2939       /* We produce at most one character.  */
2940       switch (iso_code_class [c1])
2941         {
2942         case ISO_0x20_or_0x7F:
2943           if (composition_state != COMPOSING_NO)
2944             {
2945               if (composition_state == COMPOSING_RULE
2946                   || composition_state == COMPOSING_COMPONENT_RULE)
2947                 {
2948                   DECODE_COMPOSITION_RULE (c1);
2949                   components[component_idx++] = c1;
2950                   composition_state--;
2951                   continue;
2952                 }
2953             }
2954           if (charset_id_0 < 0
2955               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2956             /* This is SPACE or DEL.  */
2957             charset = CHARSET_FROM_ID (charset_ascii);
2958           else
2959             charset = CHARSET_FROM_ID (charset_id_0);
2960           break;
2961
2962         case ISO_graphic_plane_0:
2963           if (composition_state != COMPOSING_NO)
2964             {
2965               if (composition_state == COMPOSING_RULE
2966                   || composition_state == COMPOSING_COMPONENT_RULE)
2967                 {
2968                   DECODE_COMPOSITION_RULE (c1);
2969                   components[component_idx++] = c1;
2970                   composition_state--;
2971                   continue;
2972                 }
2973             }
2974           if (charset_id_0 < 0)
2975             charset = CHARSET_FROM_ID (charset_ascii);
2976           else
2977             charset = CHARSET_FROM_ID (charset_id_0);
2978           break;
2979
2980         case ISO_0xA0_or_0xFF:
2981           if (charset_id_1 < 0
2982               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2983               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2984             goto invalid_code;
2985           /* This is a graphic character, we fall down ... */
2986
2987         case ISO_graphic_plane_1:
2988           if (charset_id_1 < 0)
2989             goto invalid_code;
2990           charset = CHARSET_FROM_ID (charset_id_1);
2991           break;
2992
2993         case ISO_control_0:
2994           MAYBE_FINISH_COMPOSITION ();
2995           charset = CHARSET_FROM_ID (charset_ascii);
2996           break;
2997
2998         case ISO_control_1:
2999           MAYBE_FINISH_COMPOSITION ();
3000           goto invalid_code;
3001
3002         case ISO_shift_out:
3003           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3004               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3005             goto invalid_code;
3006           CODING_ISO_INVOCATION (coding, 0) = 1;
3007           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3008           continue;
3009
3010         case ISO_shift_in:
3011           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3012             goto invalid_code;
3013           CODING_ISO_INVOCATION (coding, 0) = 0;
3014           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3015           continue;
3016
3017         case ISO_single_shift_2_7:
3018         case ISO_single_shift_2:
3019           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3020             goto invalid_code;
3021           /* SS2 is handled as an escape sequence of ESC 'N' */
3022           c1 = 'N';
3023           goto label_escape_sequence;
3024
3025         case ISO_single_shift_3:
3026           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3027             goto invalid_code;
3028           /* SS2 is handled as an escape sequence of ESC 'O' */
3029           c1 = 'O';
3030           goto label_escape_sequence;
3031
3032         case ISO_control_sequence_introducer:
3033           /* CSI is handled as an escape sequence of ESC '[' ...  */
3034           c1 = '[';
3035           goto label_escape_sequence;
3036
3037         case ISO_escape:
3038           ONE_MORE_BYTE (c1);
3039         label_escape_sequence:
3040           /* Escape sequences handled here are invocation,
3041              designation, direction specification, and character
3042              composition specification.  */
3043           switch (c1)
3044             {
3045             case '&':           /* revision of following character set */
3046               ONE_MORE_BYTE (c1);
3047               if (!(c1 >= '@' && c1 <= '~'))
3048                 goto invalid_code;
3049               ONE_MORE_BYTE (c1);
3050               if (c1 != ISO_CODE_ESC)
3051                 goto invalid_code;
3052               ONE_MORE_BYTE (c1);
3053               goto label_escape_sequence;
3054
3055             case '$':           /* designation of 2-byte character set */
3056               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3057                 goto invalid_code;
3058               {
3059                 int reg, chars96;
3060
3061                 ONE_MORE_BYTE (c1);
3062                 if (c1 >= '@' && c1 <= 'B')
3063                   {     /* designation of JISX0208.1978, GB2312.1980,
3064                            or JISX0208.1980 */
3065                     reg = 0, chars96 = 0;
3066                   }
3067                 else if (c1 >= 0x28 && c1 <= 0x2B)
3068                   { /* designation of DIMENSION2_CHARS94 character set */
3069                     reg = c1 - 0x28, chars96 = 0;
3070                     ONE_MORE_BYTE (c1);
3071                   }
3072                 else if (c1 >= 0x2C && c1 <= 0x2F)
3073                   { /* designation of DIMENSION2_CHARS96 character set */
3074                     reg = c1 - 0x2C, chars96 = 1;
3075                     ONE_MORE_BYTE (c1);
3076                   }
3077                 else
3078                   goto invalid_code;
3079                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3080                 /* We must update these variables now.  */
3081                 if (reg == 0)
3082                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3083                 else if (reg == 1)
3084                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3085                 if (chars96 < 0)
3086                   goto invalid_code;
3087               }
3088               continue;
3089
3090             case 'n':           /* invocation of locking-shift-2 */
3091               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3092                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3093                 goto invalid_code;
3094               CODING_ISO_INVOCATION (coding, 0) = 2;
3095               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3096               continue;
3097
3098             case 'o':           /* invocation of locking-shift-3 */
3099               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3100                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3101                 goto invalid_code;
3102               CODING_ISO_INVOCATION (coding, 0) = 3;
3103               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3104               continue;
3105
3106             case 'N':           /* invocation of single-shift-2 */
3107               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3108                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3109                 goto invalid_code;
3110               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3111               if (charset_id_2 < 0)
3112                 charset = CHARSET_FROM_ID (charset_ascii);
3113               else
3114                 charset = CHARSET_FROM_ID (charset_id_2);
3115               ONE_MORE_BYTE (c1);
3116               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3117                 goto invalid_code;
3118               break;
3119
3120             case 'O':           /* invocation of single-shift-3 */
3121               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3122                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3123                 goto invalid_code;
3124               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3125               if (charset_id_3 < 0)
3126                 charset = CHARSET_FROM_ID (charset_ascii);
3127               else
3128                 charset = CHARSET_FROM_ID (charset_id_3);
3129               ONE_MORE_BYTE (c1);
3130               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3131                 goto invalid_code;
3132               break;
3133
3134             case '0': case '2': case '3': case '4': /* start composition */
3135               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3136                 goto invalid_code;
3137               DECODE_COMPOSITION_START (c1);
3138               continue;
3139
3140             case '1':           /* end composition */
3141               if (composition_state == COMPOSING_NO)
3142                 goto invalid_code;
3143               DECODE_COMPOSITION_END ();
3144               continue;
3145
3146             case '[':           /* specification of direction */
3147               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3148                 goto invalid_code;
3149               /* For the moment, nested direction is not supported.
3150                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3151                  left-to-right, and nozero means right-to-left.  */
3152               ONE_MORE_BYTE (c1);
3153               switch (c1)
3154                 {
3155                 case ']':       /* end of the current direction */
3156                   coding->mode &= ~CODING_MODE_DIRECTION;
3157
3158                 case '0':       /* end of the current direction */
3159                 case '1':       /* start of left-to-right direction */
3160                   ONE_MORE_BYTE (c1);
3161                   if (c1 == ']')
3162                     coding->mode &= ~CODING_MODE_DIRECTION;
3163                   else
3164                     goto invalid_code;
3165                   break;
3166
3167                 case '2':       /* start of right-to-left direction */
3168                   ONE_MORE_BYTE (c1);
3169                   if (c1 == ']')
3170                     coding->mode |= CODING_MODE_DIRECTION;
3171                   else
3172                     goto invalid_code;
3173                   break;
3174
3175                 default:
3176                   goto invalid_code;
3177                 }
3178               continue;
3179
3180             case '%':
3181               ONE_MORE_BYTE (c1);
3182               if (c1 == '/')
3183                 {
3184                   /* CTEXT extended segment:
3185                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3186                      We keep these bytes as is for the moment.
3187                      They may be decoded by post-read-conversion.  */
3188                   int dim, M, L;
3189                   int size;
3190
3191                   ONE_MORE_BYTE (dim);
3192                   ONE_MORE_BYTE (M);
3193                   ONE_MORE_BYTE (L);
3194                   size = ((M - 128) * 128) + (L - 128);
3195                   if (charbuf + 8 + size > charbuf_end)
3196                     goto break_loop;
3197                   *charbuf++ = ISO_CODE_ESC;
3198                   *charbuf++ = '%';
3199                   *charbuf++ = '/';
3200                   *charbuf++ = dim;
3201                   *charbuf++ = BYTE8_TO_CHAR (M);
3202                   *charbuf++ = BYTE8_TO_CHAR (L);
3203                   while (size-- > 0)
3204                     {
3205                       ONE_MORE_BYTE (c1);
3206                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3207                     }
3208                 }
3209               else if (c1 == 'G')
3210                 {
3211                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3212                      ESC % G --UTF-8-BYTES-- ESC % @
3213                      We keep these bytes as is for the moment.
3214                      They may be decoded by post-read-conversion.  */
3215                   int *p = charbuf;
3216
3217                   if (p + 6 > charbuf_end)
3218                     goto break_loop;
3219                   *p++ = ISO_CODE_ESC;
3220                   *p++ = '%';
3221                   *p++ = 'G';
3222                   while (p < charbuf_end)
3223                     {
3224                       ONE_MORE_BYTE (c1);
3225                       if (c1 == ISO_CODE_ESC
3226                           && src + 1 < src_end
3227                           && src[0] == '%'
3228                           && src[1] == '@')
3229                         {
3230                           src += 2;
3231                           break;
3232                         }
3233                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3234                     }
3235                   if (p + 3 > charbuf_end)
3236                     goto break_loop;
3237                   *p++ = ISO_CODE_ESC;
3238                   *p++ = '%';
3239                   *p++ = '@';
3240                   charbuf = p;
3241                 }
3242               else
3243                 goto invalid_code;
3244               continue;
3245               break;
3246
3247             default:
3248               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3249                 goto invalid_code;
3250               {
3251                 int reg, chars96;
3252
3253                 if (c1 >= 0x28 && c1 <= 0x2B)
3254                   { /* designation of DIMENSION1_CHARS94 character set */
3255                     reg = c1 - 0x28, chars96 = 0;
3256                     ONE_MORE_BYTE (c1);
3257                   }
3258                 else if (c1 >= 0x2C && c1 <= 0x2F)
3259                   { /* designation of DIMENSION1_CHARS96 character set */
3260                     reg = c1 - 0x2C, chars96 = 1;
3261                     ONE_MORE_BYTE (c1);
3262                   }
3263                 else
3264                   goto invalid_code;
3265                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3266                 /* We must update these variables now.  */
3267                 if (reg == 0)
3268                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3269                 else if (reg == 1)
3270                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3271                 if (chars96 < 0)
3272                   goto invalid_code;
3273               }
3274               continue;
3275             }
3276         }
3277
3278       if (charset->id != charset_ascii
3279           && last_id != charset->id)
3280         {
3281           if (last_id != charset_ascii)
3282             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3283           last_id = charset->id;
3284           last_offset = char_offset;
3285         }
3286
3287       /* Now we know CHARSET and 1st position code C1 of a character.
3288          Produce a decoded character while getting 2nd position code
3289          C2 if necessary.  */
3290       c1 &= 0x7F;
3291       if (CHARSET_DIMENSION (charset) > 1)
3292         {
3293           ONE_MORE_BYTE (c2);
3294           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3295             /* C2 is not in a valid range.  */
3296             goto invalid_code;
3297           c1 = (c1 << 8) | (c2 & 0x7F);
3298           if (CHARSET_DIMENSION (charset) > 2)
3299             {
3300               ONE_MORE_BYTE (c2);
3301               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3302                 /* C2 is not in a valid range.  */
3303                 goto invalid_code;
3304               c1 = (c1 << 8) | (c2 & 0x7F);
3305             }
3306         }
3307
3308       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3309       if (c < 0)
3310         {
3311           MAYBE_FINISH_COMPOSITION ();
3312           for (; src_base < src; src_base++, char_offset++)
3313             {
3314               if (ASCII_BYTE_P (*src_base))
3315                 *charbuf++ = *src_base;
3316               else
3317                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3318             }
3319         }
3320       else if (composition_state == COMPOSING_NO)
3321         {
3322           *charbuf++ = c;
3323           char_offset++;
3324         }
3325       else
3326         {
3327           components[component_idx++] = c;
3328           if (method == COMPOSITION_WITH_RULE
3329               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3330                   && composition_state == COMPOSING_COMPONENT_CHAR))
3331             composition_state++;
3332         }
3333       continue;
3334
3335     invalid_code:
3336       MAYBE_FINISH_COMPOSITION ();
3337       src = src_base;
3338       consumed_chars = consumed_chars_base;
3339       ONE_MORE_BYTE (c);
3340       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3341       char_offset++;
3342       coding->errors++;
3343       continue;
3344
3345     break_loop:
3346       break;
3347     }
3348
3349  no_more_source:
3350   if (last_id != charset_ascii)
3351     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3352   coding->consumed_char += consumed_chars_base;
3353   coding->consumed = src_base - coding->source;
3354   coding->charbuf_used = charbuf - coding->charbuf;
3355 }
3356
3357
3358 /* ISO2022 encoding stuff.  */
3359
3360 /*
3361    It is not enough to say just "ISO2022" on encoding, we have to
3362    specify more details.  In Emacs, each coding system of ISO2022
3363    variant has the following specifications:
3364         1. Initial designation to G0 thru G3.
3365         2. Allows short-form designation?
3366         3. ASCII should be designated to G0 before control characters?
3367         4. ASCII should be designated to G0 at end of line?
3368         5. 7-bit environment or 8-bit environment?
3369         6. Use locking-shift?
3370         7. Use Single-shift?
3371    And the following two are only for Japanese:
3372         8. Use ASCII in place of JIS0201-1976-Roman?
3373         9. Use JISX0208-1983 in place of JISX0208-1978?
3374    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3375    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3376    details.
3377 */
3378
/* Produce codes (escape sequence) for designating CHARSET to graphic
   register REG at DST, and increment DST.

   When CODING has CODING_ISO_FLAG_REVISION set and the revision
   number of CHARSET is not negative, the sequence is preceded by
   ESC '&' <'@' + revision>.

   For a multi-dimensional 94-charset designated to register 0 whose
   <final-char> is '@', 'A', or 'B', the short form (no intermediate
   character after '$') is produced unless CODING has
   CODING_ISO_FLAG_LONG_FORM set.

   At the end, the id of CHARSET is recorded in
   CODING_ISO_DESIGNATION (CODING, REG).  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    /* Intermediate characters, indexed by graphic register.  */        \
    char *intermediate_char_94 = "()*+";                                \
    char *intermediate_char_96 = ",-./";                                \
    unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
    int revision                                                        \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset) : -1);                        \
                                                                        \
    if (revision >= 0)                                                  \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + revision);                                 \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ('$');                                      \
        if (CHARSET_ISO_CHARS_96 (charset))                             \
          EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
        else if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM) \
                 || reg != 0                                            \
                 || ! (final_char >= '@' && final_char <= 'B'))         \
          /* The short form is not applicable.  */                      \
          EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);              \
      }                                                                 \
    else                                                                \
      EMIT_ONE_ASCII_BYTE (CHARSET_ISO_CHARS_96 (charset)               \
                           ? intermediate_char_96[reg]                  \
                           : intermediate_char_94[reg]);                \
    EMIT_ONE_ASCII_BYTE (final_char);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3426
3427
/* The following two macros produce codes for the ISO2022 single-shift
   functions (single-shift-2 and single-shift-3).  When CODING has
   CODING_ISO_FLAG_SEVEN_BITS set, the two-byte escape sequence
   (ESC 'N' or ESC 'O') is produced; otherwise the single control
   byte (ISO_CODE_SS2 or ISO_CODE_SS3) is produced.  Either way,
   CODING_ISO_SINGLE_SHIFTING (coding) is set to 1 afterwards.  Both
   macros refer to the variable `coding' of the expansion context.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)


#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3450
3451
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each of them
   also records, in CODING_ISO_INVOCATION (coding, 0), the number of
   the graphic register that is now invoked to graphic plane 0.  They
   refer to the variable `coding' of the expansion context.  */

/* Shift-in (SI): invoke graphic register 0.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


/* Shift-out (SO): invoke graphic register 1.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


/* Locking-shift-2 (ESC 'n'): invoke graphic register 2.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* Locking-shift-3 (ESC 'o'): invoke graphic register 3.  The final
   byte must be 'o'; ESC 'n' is locking-shift-2, and a decoder would
   invoke graphic register 2 on reading it.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3482
3483
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING has CODING_ISO_FLAG_USE_ROMAN
   set and CHARSET is ASCII, it is replaced by the charset whose id is
   charset_jisx0201_roman.  C1 may be any expression; it is
   parenthesized at each use.  The macro refers to the variables
   `coding', `dst', and `produced_chars' of the expansion context.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single-shift code has just been produced; it applies to    \
           this character only.  */                                     \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3526
3527
/* Produce codes for a DIMENSION2 character whose character set is
   CHARSET and whose position-codes are C1 and C2.  Designation and
   invocation codes are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is charset_jisx0208, it
   is replaced by the charset whose id is charset_jisx0208_1978.  The
   macro refers to the variables `coding', `dst', and `produced_chars'
   of the expansion context.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
        && id == charset_jisx0208)                                      \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (! CODING_ISO_SINGLE_SHIFTING (coding))                          \
      {                                                                 \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))               \
          {                                                             \
            EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);            \
            break;                                                      \
          }                                                             \
        if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))               \
          {                                                             \
            EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                  \
            break;                                                      \
          }                                                             \
        /* CHARSET is not invoked to either graphic plane yet.          \
           Produce the invocation (and, if needed, designation)         \
           codes, then go around the loop again to produce the          \
           character itself.  */                                        \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
        continue;                                                       \
      }                                                                 \
                                                                        \
    /* A single-shift code has just been produced; it applies to        \
       this character only.  */                                         \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
    else                                                                \
      EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
    CODING_ISO_SINGLE_SHIFTING (coding) = 0;                            \
    break;                                                              \
  } while (1)
3570
3571
/* Produce codes for character C of CHARSET.  The code-point obtained
   by ENCODE_CHAR is passed as one position-code when the dimension of
   CHARSET is 1; otherwise it is split into its upper part (shifted
   right by 8 bits) and its low 8 bits, and passed as two
   position-codes.  */

#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int position_code = ENCODE_CHAR ((charset), (c));                      \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), position_code >> 8,      \
                                       position_code & 0xFF);              \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), position_code);          \
  } while (0)
3581
3582
3583 /* Produce designation and invocation codes at a place pointed by DST
3584    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3585    Return new DST.  */
3586
3587 unsigned char *
3588 encode_invocation_designation (charset, coding, dst, p_nchars)
3589      struct charset *charset;
3590      struct coding_system *coding;
3591      unsigned char *dst;
3592      int *p_nchars;
3593 {
3594   int multibytep = coding->dst_multibyte;
3595   int produced_chars = *p_nchars;
3596   int reg;                      /* graphic register number */
3597   int id = CHARSET_ID (charset);
3598
3599   /* At first, check designations.  */
3600   for (reg = 0; reg < 4; reg++)
3601     if (id == CODING_ISO_DESIGNATION (coding, reg))
3602       break;
3603
3604   if (reg >= 4)
3605     {
3606       /* CHARSET is not yet designated to any graphic registers.  */
3607       /* At first check the requested designation.  */
3608       reg = CODING_ISO_REQUEST (coding, id);
3609       if (reg < 0)
3610         /* Since CHARSET requests no special designation, designate it
3611            to graphic register 0.  */
3612         reg = 0;
3613
3614       ENCODE_DESIGNATION (charset, reg, coding);
3615     }
3616
3617   if (CODING_ISO_INVOCATION (coding, 0) != reg
3618       && CODING_ISO_INVOCATION (coding, 1) != reg)
3619     {
3620       /* Since the graphic register REG is not invoked to any graphic
3621          planes, invoke it to graphic plane 0.  */
3622       switch (reg)
3623         {
3624         case 0:                 /* graphic register 0 */
3625           ENCODE_SHIFT_IN;
3626           break;
3627
3628         case 1:                 /* graphic register 1 */
3629           ENCODE_SHIFT_OUT;
3630           break;
3631
3632         case 2:                 /* graphic register 2 */
3633           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3634             ENCODE_SINGLE_SHIFT_2;
3635           else
3636             ENCODE_LOCKING_SHIFT_2;
3637           break;
3638
3639         case 3:                 /* graphic register 3 */
3640           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3641             ENCODE_SINGLE_SHIFT_3;
3642           else
3643             ENCODE_LOCKING_SHIFT_3;
3644           break;
3645         }
3646     }
3647
3648   *p_nchars = produced_chars;
3649   return dst;
3650 }
3651
/* The following three macros produce codes for indicating direction
   of text.  They refer to the variable `coding' of the expansion
   context.  */

/* Produce a control sequence introducer: the escape sequence
   ESC '[' when CODING has CODING_ISO_FLAG_SEVEN_BITS set, the single
   byte ISO_CODE_CSI otherwise.  CODING_ISO_FLAGS is a set of flag
   bits, so the flag is tested with `&'; comparing the whole value
   with `==' would succeed only when no other flag is set.  This is
   an object-like macro; use it without an argument list.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)


/* Produce CSI '2' ']' (start of right-to-left direction).  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)


/* Produce CSI '0' ']' (end of the current direction, i.e. back to
   left-to-right).  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
3675
3676
/* Produce the codes that put the graphic planes and registers back
   into their initial state: a shift-in when the invocation recorded
   for graphic plane 0 is not register 0, followed by a designation
   for every graphic register whose initial charset is set (>= 0) and
   differs from what is currently designated there.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reset_reg;                                                      \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reset_reg = 0; reset_reg < 4; reset_reg++)                     \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, reset_reg);        \
        struct charset *initial_charset;                                \
                                                                        \
        if (initial_id < 0                                              \
            || CODING_ISO_DESIGNATION (coding, reset_reg) == initial_id) \
          continue;                                                     \
        initial_charset = CHARSET_FROM_ID (initial_id);                 \
        ENCODE_DESIGNATION (initial_charset, reset_reg, coding);        \
      }                                                                 \
  } while (0)
3695
3696
3697 /* Produce designation sequences of charsets in the line started from
3698    SRC to a place pointed by DST, and return updated DST.
3699
3700    If the current block ends before any end-of-line, we may fail to
3701    find all the necessary designations.  */
3702
3703 static unsigned char *
3704 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3705      struct coding_system *coding;
3706      int *charbuf, *charbuf_end;
3707      unsigned char *dst;
3708 {
3709   struct charset *charset;
3710   /* Table of charsets to be designated to each graphic register.  */
3711   int r[4];
3712   int c, found = 0, reg;
3713   int produced_chars = 0;
3714   int multibytep = coding->dst_multibyte;
3715   Lisp_Object attrs;
3716   Lisp_Object charset_list;
3717
3718   attrs = CODING_ID_ATTRS (coding->id);
3719   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3720   if (EQ (charset_list, Qiso_2022))
3721     charset_list = Viso_2022_charset_list;
3722
3723   for (reg = 0; reg < 4; reg++)
3724     r[reg] = -1;
3725
3726   while (found < 4)
3727     {
3728       int id;
3729
3730       c = *charbuf++;
3731       if (c == '\n')
3732         break;
3733       charset = char_charset (c, charset_list, NULL);
3734       id = CHARSET_ID (charset);
3735       reg = CODING_ISO_REQUEST (coding, id);
3736       if (reg >= 0 && r[reg] < 0)
3737         {
3738           found++;
3739           r[reg] = id;
3740         }
3741     }
3742
3743   if (found)
3744     {
3745       for (reg = 0; reg < 4; reg++)
3746         if (r[reg] >= 0
3747             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3748           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3749     }
3750
3751   return dst;
3752 }
3753
3754 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3755
3756 static int
3757 encode_coding_iso_2022 (coding)
3758      struct coding_system *coding;
3759 {
3760   int multibytep = coding->dst_multibyte;
3761   int *charbuf = coding->charbuf;
3762   int *charbuf_end = charbuf + coding->charbuf_used;
3763   unsigned char *dst = coding->destination + coding->produced;
3764   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3765   int safe_room = 16;
3766   int bol_designation
3767     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3768        && CODING_ISO_BOL (coding));
3769   int produced_chars = 0;
3770   Lisp_Object attrs, eol_type, charset_list;
3771   int ascii_compatible;
3772   int c;
3773   int preferred_charset_id = -1;
3774
3775   CODING_GET_INFO (coding, attrs, charset_list);
3776   eol_type = CODING_ID_EOL_TYPE (coding->id);
3777   if (VECTORP (eol_type))
3778     eol_type = Qunix;
3779
3780   setup_iso_safe_charsets (attrs);
3781   /* Charset list may have been changed.  */
3782   charset_list = CODING_ATTR_CHARSET_LIST (attrs);              \
3783   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3784
3785   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3786
3787   while (charbuf < charbuf_end)
3788     {
3789       ASSURE_DESTINATION (safe_room);
3790
3791       if (bol_designation)
3792         {
3793           unsigned char *dst_prev = dst;
3794
3795           /* We have to produce designation sequences if any now.  */
3796           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3797           bol_designation = 0;
3798           /* We are sure that designation sequences are all ASCII bytes.  */
3799           produced_chars += dst - dst_prev;
3800         }
3801
3802       c = *charbuf++;
3803
3804       if (c < 0)
3805         {
3806           /* Handle an annotation.  */
3807           switch (*charbuf)
3808             {
3809             case CODING_ANNOTATE_COMPOSITION_MASK:
3810               /* Not yet implemented.  */
3811               break;
3812             case CODING_ANNOTATE_CHARSET_MASK:
3813               preferred_charset_id = charbuf[2];
3814               if (preferred_charset_id >= 0
3815                   && NILP (Fmemq (make_number (preferred_charset_id),
3816                                   charset_list)))
3817                 preferred_charset_id = -1;
3818               break;
3819             default:
3820               abort ();
3821             }
3822           charbuf += -c - 1;
3823           continue;
3824         }
3825
3826       /* Now encode the character C.  */
3827       if (c < 0x20 || c == 0x7F)
3828         {
3829           if (c == '\n'
3830               || (c == '\r' && EQ (eol_type, Qmac)))
3831             {
3832               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3833                 ENCODE_RESET_PLANE_AND_REGISTER ();
3834               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3835                 {
3836                   int i;
3837
3838                   for (i = 0; i < 4; i++)
3839                     CODING_ISO_DESIGNATION (coding, i)
3840                       = CODING_ISO_INITIAL (coding, i);
3841                 }
3842               bol_designation
3843                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3844             }
3845           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3846             ENCODE_RESET_PLANE_AND_REGISTER ();
3847           EMIT_ONE_ASCII_BYTE (c);
3848         }
3849       else if (ASCII_CHAR_P (c))
3850         {
3851           if (ascii_compatible)
3852             EMIT_ONE_ASCII_BYTE (c);
3853           else
3854             {
3855               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3856               ENCODE_ISO_CHARACTER (charset, c);
3857             }
3858         }
3859       else if (CHAR_BYTE8_P (c))
3860         {
3861           c = CHAR_TO_BYTE8 (c);
3862           EMIT_ONE_BYTE (c);
3863         }
3864       else
3865         {
3866           struct charset *charset;
3867
3868           if (preferred_charset_id >= 0)
3869             {
3870               charset = CHARSET_FROM_ID (preferred_charset_id);
3871               if (! CHAR_CHARSET_P (c, charset))
3872                 charset = char_charset (c, charset_list, NULL);
3873             }
3874           else
3875             charset = char_charset (c, charset_list, NULL);
3876           if (!charset)
3877             {
3878               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3879                 {
3880                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3881                   charset = CHARSET_FROM_ID (charset_ascii);
3882                 }
3883               else
3884                 {
3885                   c = coding->default_char;
3886                   charset = char_charset (c, charset_list, NULL);
3887                 }
3888             }
3889           ENCODE_ISO_CHARACTER (charset, c);
3890         }
3891     }
3892
3893   if (coding->mode & CODING_MODE_LAST_BLOCK
3894       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3895     {
3896       ASSURE_DESTINATION (safe_room);
3897       ENCODE_RESET_PLANE_AND_REGISTER ();
3898     }
3899   record_conversion_result (coding, CODING_RESULT_SUCCESS);
3900   CODING_ISO_BOL (coding) = bol_designation;
3901   coding->produced_char += produced_chars;
3902   coding->produced = dst - coding->destination;
3903   return 0;
3904 }
3905
3906 \f
3907 /*** 8,9. SJIS and BIG5 handlers ***/
3908
/* Although SJIS and BIG5 are not ISO coding systems, they are used
   quite widely.  So, for the moment, Emacs supports them directly in
   C code.  But, in the future, they may be supported only by CCL.  */
3912
/* SJIS is a coding system encoding three character sets: ASCII, the
   right half of JISX0201-Kana, and JISX0208.  An ASCII character is
   encoded as is.  A character of charset katakana-jisx0201 is encoded
   as "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, with its two position-codes divided and
   shifted so that they fit in the ranges below.
3919
3920    --- CODE RANGE of SJIS ---
3921    (character set)      (range)
3922    ASCII                0x00 .. 0x7F
3923    KATAKANA-JISX0201    0xA0 .. 0xDF
3924    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3925             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3926    -------------------------------
3927
3928 */
3929
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and is encoded in two bytes.
3933
3934    --- CODE RANGE of BIG5 ---
3935    (character set)      (range)
3936    ASCII                0x00 .. 0x7F
3937    Big5 (1st byte)      0xA1 .. 0xFE
3938         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3939    --------------------------
3940
3941   */
3942
3943 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3944    Check if a text is encoded in SJIS.  If it is, return
3945    CATEGORY_MASK_SJIS, else return 0.  */
3946
3947 static int
3948 detect_coding_sjis (coding, detect_info)
3949      struct coding_system *coding;
3950      struct coding_detection_info *detect_info;
3951 {
3952   const unsigned char *src = coding->source, *src_base;
3953   const unsigned char *src_end = coding->source + coding->src_bytes;
3954   int multibytep = coding->src_multibyte;
3955   int consumed_chars = 0;
3956   int found = 0;
3957   int c;
3958
3959   detect_info->checked |= CATEGORY_MASK_SJIS;
3960   /* A coding system of this category is always ASCII compatible.  */
3961   src += coding->head_ascii;
3962
3963   while (1)
3964     {
3965       src_base = src;
3966       ONE_MORE_BYTE (c);
3967       if (c < 0x80)
3968         continue;
3969       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3970         {
3971           ONE_MORE_BYTE (c);
3972           if (c < 0x40 || c == 0x7F || c > 0xFC)
3973             break;
3974           found = CATEGORY_MASK_SJIS;
3975         }
3976       else if (c >= 0xA0 && c < 0xE0)
3977         found = CATEGORY_MASK_SJIS;
3978       else
3979         break;
3980     }
3981   detect_info->rejected |= CATEGORY_MASK_SJIS;
3982   return 0;
3983
3984  no_more_source:
3985   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3986     {
3987       detect_info->rejected |= CATEGORY_MASK_SJIS;
3988       return 0;
3989     }
3990   detect_info->found |= found;
3991   return 1;
3992 }
3993
3994 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3995    Check if a text is encoded in BIG5.  If it is, return
3996    CATEGORY_MASK_BIG5, else return 0.  */
3997
3998 static int
3999 detect_coding_big5 (coding, detect_info)
4000      struct coding_system *coding;
4001      struct coding_detection_info *detect_info;
4002 {
4003   const unsigned char *src = coding->source, *src_base;
4004   const unsigned char *src_end = coding->source + coding->src_bytes;
4005   int multibytep = coding->src_multibyte;
4006   int consumed_chars = 0;
4007   int found = 0;
4008   int c;
4009
4010   detect_info->checked |= CATEGORY_MASK_BIG5;
4011   /* A coding system of this category is always ASCII compatible.  */
4012   src += coding->head_ascii;
4013
4014   while (1)
4015     {
4016       src_base = src;
4017       ONE_MORE_BYTE (c);
4018       if (c < 0x80)
4019         continue;
4020       if (c >= 0xA1)
4021         {
4022           ONE_MORE_BYTE (c);
4023           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4024             return 0;
4025           found = CATEGORY_MASK_BIG5;
4026         }
4027       else
4028         break;
4029     }
4030   detect_info->rejected |= CATEGORY_MASK_BIG5;
4031   return 0;
4032
4033  no_more_source:
4034   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4035     {
4036       detect_info->rejected |= CATEGORY_MASK_BIG5;
4037       return 0;
4038     }
4039   detect_info->found |= found;
4040   return 1;
4041 }
4042
4043 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4044    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4045
4046 static void
4047 decode_coding_sjis (coding)
4048      struct coding_system *coding;
4049 {
4050   const unsigned char *src = coding->source + coding->consumed;
4051   const unsigned char *src_end = coding->source + coding->src_bytes;
4052   const unsigned char *src_base;
4053   int *charbuf = coding->charbuf + coding->charbuf_used;
4054   int *charbuf_end
4055     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4056   int consumed_chars = 0, consumed_chars_base;
4057   int multibytep = coding->src_multibyte;
4058   struct charset *charset_roman, *charset_kanji, *charset_kana;
4059   struct charset *charset_kanji2;
4060   Lisp_Object attrs, charset_list, val;
4061   int char_offset = coding->produced_char;
4062   int last_offset = char_offset;
4063   int last_id = charset_ascii;
4064
4065   CODING_GET_INFO (coding, attrs, charset_list);
4066
4067   val = charset_list;
4068   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4069   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4070   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4071   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4072
4073   while (1)
4074     {
4075       int c, c1;
4076       struct charset *charset;
4077
4078       src_base = src;
4079       consumed_chars_base = consumed_chars;
4080
4081       if (charbuf >= charbuf_end)
4082         break;
4083
4084       ONE_MORE_BYTE (c);
4085       if (c < 0)
4086         goto invalid_code;
4087       if (c < 0x80)
4088         charset = charset_roman;
4089       else if (c == 0x80 || c == 0xA0)
4090         goto invalid_code;
4091       else if (c >= 0xA1 && c <= 0xDF)
4092         {
4093           /* SJIS -> JISX0201-Kana */
4094           c &= 0x7F;
4095           charset = charset_kana;
4096         }
4097       else if (c <= 0xEF)
4098         {
4099           /* SJIS -> JISX0208 */
4100           ONE_MORE_BYTE (c1);
4101           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4102             goto invalid_code;
4103           c = (c << 8) | c1;
4104           SJIS_TO_JIS (c);
4105           charset = charset_kanji;
4106         }
4107       else if (c <= 0xFC && charset_kanji2)
4108         {
4109           /* SJIS -> JISX0213-2 */
4110           ONE_MORE_BYTE (c1);
4111           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4112             goto invalid_code;
4113           c = (c << 8) | c1;
4114           SJIS_TO_JIS2 (c);
4115           charset = charset_kanji2;
4116         }
4117       else
4118         goto invalid_code;
4119       if (charset->id != charset_ascii
4120           && last_id != charset->id)
4121         {
4122           if (last_id != charset_ascii)
4123             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4124           last_id = charset->id;
4125           last_offset = char_offset;
4126         }
4127       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4128       *charbuf++ = c;
4129       char_offset++;
4130       continue;
4131
4132     invalid_code:
4133       src = src_base;
4134       consumed_chars = consumed_chars_base;
4135       ONE_MORE_BYTE (c);
4136       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4137       char_offset++;
4138       coding->errors++;
4139     }
4140
4141  no_more_source:
4142   if (last_id != charset_ascii)
4143     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4144   coding->consumed_char += consumed_chars_base;
4145   coding->consumed = src_base - coding->source;
4146   coding->charbuf_used = charbuf - coding->charbuf;
4147 }
4148
4149 static void
4150 decode_coding_big5 (coding)
4151      struct coding_system *coding;
4152 {
4153   const unsigned char *src = coding->source + coding->consumed;
4154   const unsigned char *src_end = coding->source + coding->src_bytes;
4155   const unsigned char *src_base;
4156   int *charbuf = coding->charbuf + coding->charbuf_used;
4157   int *charbuf_end
4158     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4159   int consumed_chars = 0, consumed_chars_base;
4160   int multibytep = coding->src_multibyte;
4161   struct charset *charset_roman, *charset_big5;
4162   Lisp_Object attrs, charset_list, val;
4163   int char_offset = coding->produced_char;
4164   int last_offset = char_offset;
4165   int last_id = charset_ascii;
4166
4167   CODING_GET_INFO (coding, attrs, charset_list);
4168   val = charset_list;
4169   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4170   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4171
4172   while (1)
4173     {
4174       int c, c1;
4175       struct charset *charset;
4176
4177       src_base = src;
4178       consumed_chars_base = consumed_chars;
4179
4180       if (charbuf >= charbuf_end)
4181         break;
4182
4183       ONE_MORE_BYTE (c);
4184
4185       if (c < 0)
4186         goto invalid_code;
4187       if (c < 0x80)
4188         charset = charset_roman;
4189       else
4190         {
4191           /* BIG5 -> Big5 */
4192           if (c < 0xA1 || c > 0xFE)
4193             goto invalid_code;
4194           ONE_MORE_BYTE (c1);
4195           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4196             goto invalid_code;
4197           c = c << 8 | c1;
4198           charset = charset_big5;
4199         }
4200       if (charset->id != charset_ascii
4201           && last_id != charset->id)
4202         {
4203           if (last_id != charset_ascii)
4204             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4205           last_id = charset->id;
4206           last_offset = char_offset;
4207         }
4208       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4209       *charbuf++ = c;
4210       char_offset++;
4211       continue;
4212
4213     invalid_code:
4214       src = src_base;
4215       consumed_chars = consumed_chars_base;
4216       ONE_MORE_BYTE (c);
4217       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4218       char_offset++;
4219       coding->errors++;
4220     }
4221
4222  no_more_source:
4223   if (last_id != charset_ascii)
4224     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4225   coding->consumed_char += consumed_chars_base;
4226   coding->consumed = src_base - coding->source;
4227   coding->charbuf_used = charbuf - coding->charbuf;
4228 }
4229
4230 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4231    This function can encode charsets `ascii', `katakana-jisx0201',
4232    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4233    are sure that all these charsets are registered as official charset
4234    (i.e. do not have extended leading-codes).  Characters of other
4235    charsets are produced without any encoding.  If SJIS_P is 1, encode
4236    SJIS text, else encode BIG5 text.  */
4237
4238 static int
4239 encode_coding_sjis (coding)
4240      struct coding_system *coding;
4241 {
4242   int multibytep = coding->dst_multibyte;
4243   int *charbuf = coding->charbuf;
4244   int *charbuf_end = charbuf + coding->charbuf_used;
4245   unsigned char *dst = coding->destination + coding->produced;
4246   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4247   int safe_room = 4;
4248   int produced_chars = 0;
4249   Lisp_Object attrs, charset_list, val;
4250   int ascii_compatible;
4251   struct charset *charset_roman, *charset_kanji, *charset_kana;
4252   struct charset *charset_kanji2;
4253   int c;
4254
4255   CODING_GET_INFO (coding, attrs, charset_list);
4256   val = charset_list;
4257   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4258   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4259   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4260   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4261
4262   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4263
4264   while (charbuf < charbuf_end)
4265     {
4266       ASSURE_DESTINATION (safe_room);
4267       c = *charbuf++;
4268       /* Now encode the character C.  */
4269       if (ASCII_CHAR_P (c) && ascii_compatible)
4270         EMIT_ONE_ASCII_BYTE (c);
4271       else if (CHAR_BYTE8_P (c))
4272         {
4273           c = CHAR_TO_BYTE8 (c);
4274           EMIT_ONE_BYTE (c);
4275         }
4276       else
4277         {
4278           unsigned code;
4279           struct charset *charset = char_charset (c, charset_list, &code);
4280
4281           if (!charset)
4282             {
4283               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4284                 {
4285                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4286                   charset = CHARSET_FROM_ID (charset_ascii);
4287                 }
4288               else
4289                 {
4290                   c = coding->default_char;
4291                   charset = char_charset (c, charset_list, &code);
4292                 }
4293             }
4294           if (code == CHARSET_INVALID_CODE (charset))
4295             abort ();
4296           if (charset == charset_kanji)
4297             {
4298               int c1, c2;
4299               JIS_TO_SJIS (code);
4300               c1 = code >> 8, c2 = code & 0xFF;
4301               EMIT_TWO_BYTES (c1, c2);
4302             }
4303           else if (charset == charset_kana)
4304             EMIT_ONE_BYTE (code | 0x80);
4305           else if (charset_kanji2 && charset == charset_kanji2)
4306             {
4307               int c1, c2;
4308
4309               c1 = code >> 8;
4310               if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4311                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4312                 {
4313                   JIS_TO_SJIS2 (code);
4314                   c1 = code >> 8, c2 = code & 0xFF;
4315                   EMIT_TWO_BYTES (c1, c2);
4316                 }
4317               else
4318                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4319             }
4320           else
4321             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4322         }
4323     }
4324   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4325   coding->produced_char += produced_chars;
4326   coding->produced = dst - coding->destination;
4327   return 0;
4328 }
4329
4330 static int
4331 encode_coding_big5 (coding)
4332      struct coding_system *coding;
4333 {
4334   int multibytep = coding->dst_multibyte;
4335   int *charbuf = coding->charbuf;
4336   int *charbuf_end = charbuf + coding->charbuf_used;
4337   unsigned char *dst = coding->destination + coding->produced;
4338   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4339   int safe_room = 4;
4340   int produced_chars = 0;
4341   Lisp_Object attrs, charset_list, val;
4342   int ascii_compatible;
4343   struct charset *charset_roman, *charset_big5;
4344   int c;
4345
4346   CODING_GET_INFO (coding, attrs, charset_list);
4347   val = charset_list;
4348   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4349   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4350   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4351
4352   while (charbuf < charbuf_end)
4353     {
4354       ASSURE_DESTINATION (safe_room);
4355       c = *charbuf++;
4356       /* Now encode the character C.  */
4357       if (ASCII_CHAR_P (c) && ascii_compatible)
4358         EMIT_ONE_ASCII_BYTE (c);
4359       else if (CHAR_BYTE8_P (c))
4360         {
4361           c = CHAR_TO_BYTE8 (c);
4362           EMIT_ONE_BYTE (c);
4363         }
4364       else
4365         {
4366           unsigned code;
4367           struct charset *charset = char_charset (c, charset_list, &code);
4368
4369           if (! charset)
4370             {
4371               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4372                 {
4373                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4374                   charset = CHARSET_FROM_ID (charset_ascii);
4375                 }
4376               else
4377                 {
4378                   c = coding->default_char;
4379                   charset = char_charset (c, charset_list, &code);
4380                 }
4381             }
4382           if (code == CHARSET_INVALID_CODE (charset))
4383             abort ();
4384           if (charset == charset_big5)
4385             {
4386               int c1, c2;
4387
4388               c1 = code >> 8, c2 = code & 0xFF;
4389               EMIT_TWO_BYTES (c1, c2);
4390             }
4391           else
4392             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4393         }
4394     }
4395   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4396   coding->produced_char += produced_chars;
4397   coding->produced = dst - coding->destination;
4398   return 0;
4399 }
4400
4401 \f
4402 /*** 10. CCL handlers ***/
4403
4404 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4405    Check if a text is encoded in a coding system of which
4406    encoder/decoder are written in CCL program.  If it is, return
4407    CATEGORY_MASK_CCL, else return 0.  */
4408
4409 static int
4410 detect_coding_ccl (coding, detect_info)
4411      struct coding_system *coding;
4412      struct coding_detection_info *detect_info;
4413 {
4414   const unsigned char *src = coding->source, *src_base;
4415   const unsigned char *src_end = coding->source + coding->src_bytes;
4416   int multibytep = coding->src_multibyte;
4417   int consumed_chars = 0;
4418   int found = 0;
4419   unsigned char *valids;
4420   int head_ascii = coding->head_ascii;
4421   Lisp_Object attrs;
4422
4423   detect_info->checked |= CATEGORY_MASK_CCL;
4424
4425   coding = &coding_categories[coding_category_ccl];
4426   valids = CODING_CCL_VALIDS (coding);
4427   attrs = CODING_ID_ATTRS (coding->id);
4428   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4429     src += head_ascii;
4430
4431   while (1)
4432     {
4433       int c;
4434
4435       src_base = src;
4436       ONE_MORE_BYTE (c);
4437       if (c < 0 || ! valids[c])
4438         break;
4439       if ((valids[c] > 1))
4440         found = CATEGORY_MASK_CCL;
4441     }
4442   detect_info->rejected |= CATEGORY_MASK_CCL;
4443   return 0;
4444
4445  no_more_source:
4446   detect_info->found |= found;
4447   return 1;
4448 }
4449
4450 static void
4451 decode_coding_ccl (coding)
4452      struct coding_system *coding;
4453 {
4454   const unsigned char *src = coding->source + coding->consumed;
4455   const unsigned char *src_end = coding->source + coding->src_bytes;
4456   int *charbuf = coding->charbuf + coding->charbuf_used;
4457   int *charbuf_end = coding->charbuf + coding->charbuf_size;
4458   int consumed_chars = 0;
4459   int multibytep = coding->src_multibyte;
4460   struct ccl_program ccl;
4461   int source_charbuf[1024];
4462   int source_byteidx[1024];
4463   Lisp_Object attrs, charset_list;
4464
4465   CODING_GET_INFO (coding, attrs, charset_list);
4466   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4467
4468   while (src < src_end)
4469     {
4470       const unsigned char *p = src;
4471       int *source, *source_end;
4472       int i = 0;
4473
4474       if (multibytep)
4475         while (i < 1024 && p < src_end)
4476           {
4477             source_byteidx[i] = p - src;
4478             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4479           }
4480       else
4481         while (i < 1024 && p < src_end)
4482           source_charbuf[i++] = *p++;
4483
4484       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4485         ccl.last_block = 1;
4486
4487       source = source_charbuf;
4488       source_end = source + i;
4489       while (source < source_end)
4490         {
4491           ccl_driver (&ccl, source, charbuf,
4492                       source_end - source, charbuf_end - charbuf,
4493                       charset_list);
4494           source += ccl.consumed;
4495           charbuf += ccl.produced;
4496           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4497             break;
4498         }
4499       if (source < source_end)
4500         src += source_byteidx[source - source_charbuf];
4501       else
4502         src = p;
4503       consumed_chars += source - source_charbuf;
4504
4505       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4506           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4507         break;
4508     }
4509
4510   switch (ccl.status)
4511     {
4512     case CCL_STAT_SUSPEND_BY_SRC:
4513       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4514       break;
4515     case CCL_STAT_SUSPEND_BY_DST:
4516       break;
4517     case CCL_STAT_QUIT:
4518     case CCL_STAT_INVALID_CMD:
4519       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4520       break;
4521     default:
4522       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4523       break;
4524     }
4525   coding->consumed_char += consumed_chars;
4526   coding->consumed = src - coding->source;
4527   coding->charbuf_used = charbuf - coding->charbuf;
4528 }
4529
4530 static int
4531 encode_coding_ccl (coding)
4532      struct coding_system *coding;
4533 {
4534   struct ccl_program ccl;
4535   int multibytep = coding->dst_multibyte;
4536   int *charbuf = coding->charbuf;
4537   int *charbuf_end = charbuf + coding->charbuf_used;
4538   unsigned char *dst = coding->destination + coding->produced;
4539   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4540   unsigned char *adjusted_dst_end = dst_end - 1;
4541   int destination_charbuf[1024];
4542   int i, produced_chars = 0;
4543   Lisp_Object attrs, charset_list;
4544
4545   CODING_GET_INFO (coding, attrs, charset_list);
4546   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4547
4548   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4549   ccl.dst_multibyte = coding->dst_multibyte;
4550
4551   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4552     {
4553       int dst_bytes = dst_end - dst;
4554       if (dst_bytes > 1024)
4555         dst_bytes = 1024;
4556
4557       ccl_driver (&ccl, charbuf, destination_charbuf,
4558                   charbuf_end - charbuf, dst_bytes, charset_list);
4559       charbuf += ccl.consumed;
4560       if (multibytep)
4561         for (i = 0; i < ccl.produced; i++)
4562           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4563       else
4564         {
4565           for (i = 0; i < ccl.produced; i++)
4566             *dst++ = destination_charbuf[i] & 0xFF;
4567           produced_chars += ccl.produced;
4568         }
4569     }
4570
4571   switch (ccl.status)
4572     {
4573     case CCL_STAT_SUSPEND_BY_SRC:
4574       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4575       break;
4576     case CCL_STAT_SUSPEND_BY_DST:
4577       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4578       break;
4579     case CCL_STAT_QUIT:
4580     case CCL_STAT_INVALID_CMD:
4581       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4582       break;
4583     default:
4584       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4585       break;
4586     }
4587
4588   coding->produced_char += produced_chars;
4589   coding->produced = dst - coding->destination;
4590   return 0;
4591 }
4592
4593
4594 \f
4595 /*** 10, 11. no-conversion handlers ***/
4596
4597 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4598
4599 static void
4600 decode_coding_raw_text (coding)
4601      struct coding_system *coding;
4602 {
4603   coding->chars_at_source = 1;
4604   coding->consumed_char = 0;
4605   coding->consumed = 0;
4606   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4607 }
4608
4609 static int
4610 encode_coding_raw_text (coding)
4611      struct coding_system *coding;
4612 {
4613   int multibytep = coding->dst_multibyte;
4614   int *charbuf = coding->charbuf;
4615   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4616   unsigned char *dst = coding->destination + coding->produced;
4617   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4618   int produced_chars = 0;
4619   int c;
4620
4621   if (multibytep)
4622     {
4623       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4624
4625       if (coding->src_multibyte)
4626         while (charbuf < charbuf_end)
4627           {
4628             ASSURE_DESTINATION (safe_room);
4629             c = *charbuf++;
4630             if (ASCII_CHAR_P (c))
4631               EMIT_ONE_ASCII_BYTE (c);
4632             else if (CHAR_BYTE8_P (c))
4633               {
4634                 c = CHAR_TO_BYTE8 (c);
4635                 EMIT_ONE_BYTE (c);
4636               }
4637             else
4638               {
4639                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4640
4641                 CHAR_STRING_ADVANCE (c, p1);
4642                 while (p0 < p1)
4643                   {
4644                     EMIT_ONE_BYTE (*p0);
4645                     p0++;
4646                   }
4647               }
4648           }
4649       else
4650         while (charbuf < charbuf_end)
4651           {
4652             ASSURE_DESTINATION (safe_room);
4653             c = *charbuf++;
4654             EMIT_ONE_BYTE (c);
4655           }
4656     }
4657   else
4658     {
4659       if (coding->src_multibyte)
4660         {
4661           int safe_room = MAX_MULTIBYTE_LENGTH;
4662
4663           while (charbuf < charbuf_end)
4664             {
4665               ASSURE_DESTINATION (safe_room);
4666               c = *charbuf++;
4667               if (ASCII_CHAR_P (c))
4668                 *dst++ = c;
4669               else if (CHAR_BYTE8_P (c))
4670                 *dst++ = CHAR_TO_BYTE8 (c);
4671               else
4672                 CHAR_STRING_ADVANCE (c, dst);
4673               produced_chars++;
4674             }
4675         }
4676       else
4677         {
4678           ASSURE_DESTINATION (charbuf_end - charbuf);
4679           while (charbuf < charbuf_end && dst < dst_end)
4680             *dst++ = *charbuf++;
4681           produced_chars = dst - (coding->destination + coding->dst_bytes);
4682         }
4683     }
4684   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4685   coding->produced_char += produced_chars;
4686   coding->produced = dst - coding->destination;
4687   return 0;
4688 }
4689
4690 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4691    Check if a text is encoded in a charset-based coding system.  If it
4692    is, return 1, else return 0.  */
4693
4694 static int
4695 detect_coding_charset (coding, detect_info)
4696      struct coding_system *coding;
4697      struct coding_detection_info *detect_info;
4698 {
4699   const unsigned char *src = coding->source, *src_base;
4700   const unsigned char *src_end = coding->source + coding->src_bytes;
4701   int multibytep = coding->src_multibyte;
4702   int consumed_chars = 0;
4703   Lisp_Object attrs, valids;
4704   int found = 0;
4705
4706   detect_info->checked |= CATEGORY_MASK_CHARSET;
4707
4708   coding = &coding_categories[coding_category_charset];
4709   attrs = CODING_ID_ATTRS (coding->id);
4710   valids = AREF (attrs, coding_attr_charset_valids);
4711
4712   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4713     src += coding->head_ascii;
4714
4715   while (1)
4716     {
4717       int c;
4718
4719       src_base = src;
4720       ONE_MORE_BYTE (c);
4721       if (c < 0)
4722         continue;
4723       if (NILP (AREF (valids, c)))
4724         break;
4725       if (c >= 0x80)
4726         found = CATEGORY_MASK_CHARSET;
4727     }
4728   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4729   return 0;
4730
4731  no_more_source:
4732   detect_info->found |= found;
4733   return 1;
4734 }
4735
4736 static void
4737 decode_coding_charset (coding)
4738      struct coding_system *coding;
4739 {
4740   const unsigned char *src = coding->source + coding->consumed;
4741   const unsigned char *src_end = coding->source + coding->src_bytes;
4742   const unsigned char *src_base;
4743   int *charbuf = coding->charbuf + coding->charbuf_used;
4744   int *charbuf_end
4745     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4746   int consumed_chars = 0, consumed_chars_base;
4747   int multibytep = coding->src_multibyte;
4748   Lisp_Object attrs, charset_list, valids;
4749   int char_offset = coding->produced_char;
4750   int last_offset = char_offset;
4751   int last_id = charset_ascii;
4752
4753   CODING_GET_INFO (coding, attrs, charset_list);
4754   valids = AREF (attrs, coding_attr_charset_valids);
4755
4756   while (1)
4757     {
4758       int c;
4759       Lisp_Object val;
4760       struct charset *charset;
4761       int dim;
4762       int len = 1;
4763       unsigned code;
4764
4765       src_base = src;
4766       consumed_chars_base = consumed_chars;
4767
4768       if (charbuf >= charbuf_end)
4769         break;
4770
4771       ONE_MORE_BYTE (c);
4772       if (c < 0)
4773         goto invalid_code;
4774       code = c;
4775
4776       val = AREF (valids, c);
4777       if (NILP (val))
4778         goto invalid_code;
4779       if (INTEGERP (val))
4780         {
4781           charset = CHARSET_FROM_ID (XFASTINT (val));
4782           dim = CHARSET_DIMENSION (charset);
4783           while (len < dim)
4784             {
4785               ONE_MORE_BYTE (c);
4786               code = (code << 8) | c;
4787               len++;
4788             }
4789           CODING_DECODE_CHAR (coding, src, src_base, src_end,
4790                               charset, code, c);
4791         }
4792       else
4793         {
4794           /* VAL is a list of charset IDs.  It is assured that the
4795              list is sorted by charset dimensions (smaller one
4796              comes first).  */
4797           while (CONSP (val))
4798             {
4799               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4800               dim = CHARSET_DIMENSION (charset);
4801               while (len < dim)
4802                 {
4803                   ONE_MORE_BYTE (c);
4804                   code = (code << 8) | c;
4805                   len++;
4806                 }
4807               CODING_DECODE_CHAR (coding, src, src_base,
4808                                   src_end, charset, code, c);
4809               if (c >= 0)
4810                 break;
4811               val = XCDR (val);
4812             }
4813         }
4814       if (c < 0)
4815         goto invalid_code;
4816       if (charset->id != charset_ascii
4817           && last_id != charset->id)
4818         {
4819           if (last_id != charset_ascii)
4820             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4821           last_id = charset->id;
4822           last_offset = char_offset;
4823         }
4824
4825       *charbuf++ = c;
4826       char_offset++;
4827       continue;
4828
4829     invalid_code:
4830       src = src_base;
4831       consumed_chars = consumed_chars_base;
4832       ONE_MORE_BYTE (c);
4833       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4834       char_offset++;
4835       coding->errors++;
4836     }
4837
4838  no_more_source:
4839   if (last_id != charset_ascii)
4840     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4841   coding->consumed_char += consumed_chars_base;
4842   coding->consumed = src_base - coding->source;
4843   coding->charbuf_used = charbuf - coding->charbuf;
4844 }
4845
4846 static int
4847 encode_coding_charset (coding)
4848      struct coding_system *coding;
4849 {
4850   int multibytep = coding->dst_multibyte;
4851   int *charbuf = coding->charbuf;
4852   int *charbuf_end = charbuf + coding->charbuf_used;
4853   unsigned char *dst = coding->destination + coding->produced;
4854   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4855   int safe_room = MAX_MULTIBYTE_LENGTH;
4856   int produced_chars = 0;
4857   Lisp_Object attrs, charset_list;
4858   int ascii_compatible;
4859   int c;
4860
4861   CODING_GET_INFO (coding, attrs, charset_list);
4862   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4863
4864   while (charbuf < charbuf_end)
4865     {
4866       struct charset *charset;
4867       unsigned code;
4868
4869       ASSURE_DESTINATION (safe_room);
4870       c = *charbuf++;
4871       if (ascii_compatible && ASCII_CHAR_P (c))
4872         EMIT_ONE_ASCII_BYTE (c);
4873       else if (CHAR_BYTE8_P (c))
4874         {
4875           c = CHAR_TO_BYTE8 (c);
4876           EMIT_ONE_BYTE (c);
4877         }
4878       else
4879         {
4880           charset = char_charset (c, charset_list, &code);
4881           if (charset)
4882             {
4883               if (CHARSET_DIMENSION (charset) == 1)
4884                 EMIT_ONE_BYTE (code);
4885               else if (CHARSET_DIMENSION (charset) == 2)
4886                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4887               else if (CHARSET_DIMENSION (charset) == 3)
4888                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4889               else
4890                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4891                                  (code >> 8) & 0xFF, code & 0xFF);
4892             }
4893           else
4894             {
4895               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4896                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4897               else
4898                 c = coding->default_char;
4899               EMIT_ONE_BYTE (c);
4900             }
4901         }
4902     }
4903
4904   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4905   coding->produced_char += produced_chars;
4906   coding->produced = dst - coding->destination;
4907   return 0;
4908 }
4909
4910 \f
4911 /*** 7. C library functions ***/
4912
4913 /* Setup coding context CODING from information about CODING_SYSTEM.
4914    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4915    CODING_SYSTEM is invalid, signal an error.  */
4916
4917 void
4918 setup_coding_system (coding_system, coding)
4919      Lisp_Object coding_system;
4920      struct coding_system *coding;
4921 {
4922   Lisp_Object attrs;
4923   Lisp_Object eol_type;
4924   Lisp_Object coding_type;
4925   Lisp_Object val;
4926
4927   if (NILP (coding_system))
4928     coding_system = Qundecided;
4929
4930   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4931
4932   attrs = CODING_ID_ATTRS (coding->id);
4933   eol_type = CODING_ID_EOL_TYPE (coding->id);
4934
4935   coding->mode = 0;
4936   coding->head_ascii = -1;
4937   coding->common_flags
4938     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4939   if (! NILP (CODING_ATTR_POST_READ (attrs)))
4940     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4941   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4942     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4943   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4944     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4945
4946   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4947   coding->max_charset_id = SCHARS (val) - 1;
4948   coding->safe_charsets = (char *) SDATA (val);
4949   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4950
4951   coding_type = CODING_ATTR_TYPE (attrs);
4952   if (EQ (coding_type, Qundecided))
4953     {
4954       coding->detector = NULL;
4955       coding->decoder = decode_coding_raw_text;
4956       coding->encoder = encode_coding_raw_text;
4957       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4958     }
4959   else if (EQ (coding_type, Qiso_2022))
4960     {
4961       int i;
4962       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4963
4964       /* Invoke graphic register 0 to plane 0.  */
4965       CODING_ISO_INVOCATION (coding, 0) = 0;
4966       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4967       CODING_ISO_INVOCATION (coding, 1)
4968         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4969       /* Setup the initial status of designation.  */
4970       for (i = 0; i < 4; i++)
4971         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4972       /* Not single shifting initially.  */
4973       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4974       /* Beginning of buffer should also be regarded as bol. */
4975       CODING_ISO_BOL (coding) = 1;
4976       coding->detector = detect_coding_iso_2022;
4977       coding->decoder = decode_coding_iso_2022;
4978       coding->encoder = encode_coding_iso_2022;
4979       if (flags & CODING_ISO_FLAG_SAFE)
4980         coding->mode |= CODING_MODE_SAFE_ENCODING;
4981       coding->common_flags
4982         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4983             | CODING_REQUIRE_FLUSHING_MASK);
4984       if (flags & CODING_ISO_FLAG_COMPOSITION)
4985         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4986       if (flags & CODING_ISO_FLAG_DESIGNATION)
4987         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4988       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4989         {
4990           setup_iso_safe_charsets (attrs);
4991           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4992           coding->max_charset_id = SCHARS (val) - 1;
4993           coding->safe_charsets = (char *) SDATA (val);
4994         }
4995       CODING_ISO_FLAGS (coding) = flags;
4996     }
4997   else if (EQ (coding_type, Qcharset))
4998     {
4999       coding->detector = detect_coding_charset;
5000       coding->decoder = decode_coding_charset;
5001       coding->encoder = encode_coding_charset;
5002       coding->common_flags
5003         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5004     }
5005   else if (EQ (coding_type, Qutf_8))
5006     {
5007       coding->detector = detect_coding_utf_8;
5008       coding->decoder = decode_coding_utf_8;
5009       coding->encoder = encode_coding_utf_8;
5010       coding->common_flags
5011         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5012     }
5013   else if (EQ (coding_type, Qutf_16))
5014     {
5015       val = AREF (attrs, coding_attr_utf_16_bom);
5016       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5017                                     : EQ (val, Qt) ? utf_16_with_bom
5018                                     : utf_16_without_bom);
5019       val = AREF (attrs, coding_attr_utf_16_endian);
5020       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5021                                        : utf_16_little_endian);
5022       CODING_UTF_16_SURROGATE (coding) = 0;
5023       coding->detector = detect_coding_utf_16;
5024       coding->decoder = decode_coding_utf_16;
5025       coding->encoder = encode_coding_utf_16;
5026       coding->common_flags
5027         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5028       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5029         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5030     }
5031   else if (EQ (coding_type, Qccl))
5032     {
5033       coding->detector = detect_coding_ccl;
5034       coding->decoder = decode_coding_ccl;
5035       coding->encoder = encode_coding_ccl;
5036       coding->common_flags
5037         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5038             | CODING_REQUIRE_FLUSHING_MASK);
5039     }
5040   else if (EQ (coding_type, Qemacs_mule))
5041     {
5042       coding->detector = detect_coding_emacs_mule;
5043       coding->decoder = decode_coding_emacs_mule;
5044       coding->encoder = encode_coding_emacs_mule;
5045       coding->common_flags
5046         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5047       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5048           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5049         {
5050           Lisp_Object tail, safe_charsets;
5051           int max_charset_id = 0;
5052
5053           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5054                tail = XCDR (tail))
5055             if (max_charset_id < XFASTINT (XCAR (tail)))
5056               max_charset_id = XFASTINT (XCAR (tail));
5057           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5058                                         make_number (255));
5059           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5060                tail = XCDR (tail))
5061             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5062           coding->max_charset_id = max_charset_id;
5063           coding->safe_charsets = (char *) SDATA (safe_charsets);
5064         }
5065     }
5066   else if (EQ (coding_type, Qshift_jis))
5067     {
5068       coding->detector = detect_coding_sjis;
5069       coding->decoder = decode_coding_sjis;
5070       coding->encoder = encode_coding_sjis;
5071       coding->common_flags
5072         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5073     }
5074   else if (EQ (coding_type, Qbig5))
5075     {
5076       coding->detector = detect_coding_big5;
5077       coding->decoder = decode_coding_big5;
5078       coding->encoder = encode_coding_big5;
5079       coding->common_flags
5080         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5081     }
5082   else                          /* EQ (coding_type, Qraw_text) */
5083     {
5084       coding->detector = NULL;
5085       coding->decoder = decode_coding_raw_text;
5086       coding->encoder = encode_coding_raw_text;
5087       if (! EQ (eol_type, Qunix))
5088         {
5089           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5090           if (! VECTORP (eol_type))
5091             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5092         }
5093
5094     }
5095
5096   return;
5097 }
5098
5099 /* Return a list of charsets supported by CODING.  */
5100
5101 Lisp_Object
5102 coding_charset_list (coding)
5103      struct coding_system *coding;
5104 {
5105   Lisp_Object attrs, charset_list;
5106
5107   CODING_GET_INFO (coding, attrs, charset_list);
5108   if (EQ (CODING_ATTR_TYPE (attrs), Qiso_2022))
5109     {
5110       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
5111
5112       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
5113         charset_list = Viso_2022_charset_list;
5114     }
5115   else if (EQ (CODING_ATTR_TYPE (attrs), Qemacs_mule))
5116     {
5117       charset_list = Vemacs_mule_charset_list;
5118     }
5119   return charset_list;
5120 }
5121
5122
5123 /* Return raw-text or one of its subsidiaries that has the same
5124    eol_type as CODING-SYSTEM.  */
5125
5126 Lisp_Object
5127 raw_text_coding_system (coding_system)
5128      Lisp_Object coding_system;
5129 {
5130   Lisp_Object spec, attrs;
5131   Lisp_Object eol_type, raw_text_eol_type;
5132
5133   if (NILP (coding_system))
5134     return Qraw_text;
5135   spec = CODING_SYSTEM_SPEC (coding_system);
5136   attrs = AREF (spec, 0);
5137
5138   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5139     return coding_system;
5140
5141   eol_type = AREF (spec, 2);
5142   if (VECTORP (eol_type))
5143     return Qraw_text;
5144   spec = CODING_SYSTEM_SPEC (Qraw_text);
5145   raw_text_eol_type = AREF (spec, 2);
5146   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5147           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5148           : AREF (raw_text_eol_type, 2));
5149 }
5150
5151
5152 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5153    does, return one of the subsidiary that has the same eol-spec as
5154    PARENT.  Otherwise, return CODING_SYSTEM.  */
5155
5156 Lisp_Object
5157 coding_inherit_eol_type (coding_system, parent)
5158      Lisp_Object coding_system, parent;
5159 {
5160   Lisp_Object spec, eol_type;
5161
5162   if (NILP (coding_system))
5163     coding_system = Qraw_text;
5164   spec = CODING_SYSTEM_SPEC (coding_system);
5165   eol_type = AREF (spec, 2);
5166   if (VECTORP (eol_type)
5167       && ! NILP (parent))
5168     {
5169       Lisp_Object parent_spec;
5170       Lisp_Object parent_eol_type;
5171
5172       parent_spec
5173         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5174       parent_eol_type = AREF (parent_spec, 2);
5175       if (EQ (parent_eol_type, Qunix))
5176         coding_system = AREF (eol_type, 0);
5177       else if (EQ (parent_eol_type, Qdos))
5178         coding_system = AREF (eol_type, 1);
5179       else if (EQ (parent_eol_type, Qmac))
5180         coding_system = AREF (eol_type, 2);
5181     }
5182   return coding_system;
5183 }
5184
/* Emacs has a mechanism to automatically detect a coding system if it
   is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
   it's impossible to distinguish some coding systems accurately
   because they use the same range of codes.  So, at first, coding
   systems are classified into the following categories:
5190
5191    o coding-category-emacs-mule
5192
5193         The category for a coding system which has the same code range
5194         as Emacs' internal format.  Assigned the coding-system (Lisp
5195         symbol) `emacs-mule' by default.
5196
5197    o coding-category-sjis
5198
5199         The category for a coding system which has the same code range
5200         as SJIS.  Assigned the coding-system (Lisp
5201         symbol) `japanese-shift-jis' by default.
5202
5203    o coding-category-iso-7
5204
5205         The category for a coding system which has the same code range
5206         as ISO2022 of 7-bit environment.  This doesn't use any locking
5207         shift and single shift functions.  This can encode/decode all
5208         charsets.  Assigned the coding-system (Lisp symbol)
5209         `iso-2022-7bit' by default.
5210
5211    o coding-category-iso-7-tight
5212
5213         Same as coding-category-iso-7 except that this can
5214         encode/decode only the specified charsets.
5215
5216    o coding-category-iso-8-1
5217
5218         The category for a coding system which has the same code range
5219         as ISO2022 of 8-bit environment and graphic plane 1 used only
5220         for DIMENSION1 charset.  This doesn't use any locking shift
5221         and single shift functions.  Assigned the coding-system (Lisp
5222         symbol) `iso-latin-1' by default.
5223
5224    o coding-category-iso-8-2
5225
5226         The category for a coding system which has the same code range
5227         as ISO2022 of 8-bit environment and graphic plane 1 used only
5228         for DIMENSION2 charset.  This doesn't use any locking shift
5229         and single shift functions.  Assigned the coding-system (Lisp
5230         symbol) `japanese-iso-8bit' by default.
5231
5232    o coding-category-iso-7-else
5233
        The category for a coding system which has the same code range
        as ISO2022 of 7-bit environment but uses locking shift or
        single shift functions.  Assigned the coding-system (Lisp
        symbol) `iso-2022-7bit-lock' by default.
5238
5239    o coding-category-iso-8-else
5240
        The category for a coding system which has the same code range
        as ISO2022 of 8-bit environment but uses locking shift or
        single shift functions.  Assigned the coding-system (Lisp
        symbol) `iso-2022-8bit-ss2' by default.
5245
5246    o coding-category-big5
5247
5248         The category for a coding system which has the same code range
5249         as BIG5.  Assigned the coding-system (Lisp symbol)
5250         `cn-big5' by default.
5251
5252    o coding-category-utf-8
5253
5254         The category for a coding system which has the same code range
5255         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
5256         symbol) `utf-8' by default.
5257
5258    o coding-category-utf-16-be
5259
5260         The category for a coding system in which a text has an
5261         Unicode signature (cf. Unicode Standard) in the order of BIG
5262         endian at the head.  Assigned the coding-system (Lisp symbol)
5263         `utf-16-be' by default.
5264
5265    o coding-category-utf-16-le
5266
5267         The category for a coding system in which a text has an
5268         Unicode signature (cf. Unicode Standard) in the order of
5269         LITTLE endian at the head.  Assigned the coding-system (Lisp
5270         symbol) `utf-16-le' by default.
5271
5272    o coding-category-ccl
5273
5274         The category for a coding system of which encoder/decoder is
5275         written in CCL programs.  The default value is nil, i.e., no
5276         coding system is assigned.
5277
5278    o coding-category-binary
5279
5280         The category for a coding system not categorized in any of the
5281         above.  Assigned the coding-system (Lisp symbol)
5282         `no-conversion' by default.
5283
5284    Each of them is a Lisp symbol and the value is an actual
5285    `coding-system's (this is also a Lisp symbol) assigned by a user.
5286    What Emacs does actually is to detect a category of coding system.
5287    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5288    decide only one possible category, it selects a category of the
5289    highest priority.  Priorities of categories are also specified by a
5290    user in a Lisp variable `coding-category-list'.
5291
5292 */
5293
5294 #define EOL_SEEN_NONE   0
5295 #define EOL_SEEN_LF     1
5296 #define EOL_SEEN_CR     2
5297 #define EOL_SEEN_CRLF   4
5298
5299 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5300    SOURCE is encoded.  If CATEGORY is one of
5301    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5302    two-byte, else they are encoded by one-byte.
5303
5304    Return one of EOL_SEEN_XXX.  */
5305
5306 #define MAX_EOL_CHECK_COUNT 3
5307
5308 static int
5309 detect_eol (source, src_bytes, category)
5310      const unsigned char *source;
5311      EMACS_INT src_bytes;
5312      enum coding_category category;
5313 {
5314   const unsigned char *src = source, *src_end = src + src_bytes;
5315   unsigned char c;
5316   int total  = 0;
5317   int eol_seen = EOL_SEEN_NONE;
5318
5319   if ((1 << category) & CATEGORY_MASK_UTF_16)
5320     {
5321       int msb, lsb;
5322
5323       msb = category == (coding_category_utf_16_le
5324                          | coding_category_utf_16_le_nosig);
5325       lsb = 1 - msb;
5326
5327       while (src + 1 < src_end)
5328         {
5329           c = src[lsb];
5330           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5331             {
5332               int this_eol;
5333
5334               if (c == '\n')
5335                 this_eol = EOL_SEEN_LF;
5336               else if (src + 3 >= src_end
5337                        || src[msb + 2] != 0
5338                        || src[lsb + 2] != '\n')
5339                 this_eol = EOL_SEEN_CR;
5340               else
5341                 this_eol = EOL_SEEN_CRLF;
5342
5343               if (eol_seen == EOL_SEEN_NONE)
5344                 /* This is the first end-of-line.  */
5345                 eol_seen = this_eol;
5346               else if (eol_seen != this_eol)
5347                 {
5348                   /* The found type is different from what found before.  */
5349                   eol_seen = EOL_SEEN_LF;
5350                   break;
5351                 }
5352               if (++total == MAX_EOL_CHECK_COUNT)
5353                 break;
5354             }
5355           src += 2;
5356         }
5357     }
5358   else
5359     {
5360       while (src < src_end)
5361         {
5362           c = *src++;
5363           if (c == '\n' || c == '\r')
5364             {
5365               int this_eol;
5366
5367               if (c == '\n')
5368                 this_eol = EOL_SEEN_LF;
5369               else if (src >= src_end || *src != '\n')
5370                 this_eol = EOL_SEEN_CR;
5371               else
5372                 this_eol = EOL_SEEN_CRLF, src++;
5373
5374               if (eol_seen == EOL_SEEN_NONE)
5375                 /* This is the first end-of-line.  */
5376                 eol_seen = this_eol;
5377               else if (eol_seen != this_eol)
5378                 {
5379                   /* The found type is different from what found before.  */
5380                   eol_seen = EOL_SEEN_LF;
5381                   break;
5382                 }
5383               if (++total == MAX_EOL_CHECK_COUNT)
5384                 break;
5385             }
5386         }
5387     }
5388   return eol_seen;
5389 }
5390
5391
5392 static Lisp_Object
5393 adjust_coding_eol_type (coding, eol_seen)
5394      struct coding_system *coding;
5395      int eol_seen;
5396 {
5397   Lisp_Object eol_type;
5398
5399   eol_type = CODING_ID_EOL_TYPE (coding->id);
5400   if (eol_seen & EOL_SEEN_LF)
5401     {
5402       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5403       eol_type = Qunix;
5404     }
5405   else if (eol_seen & EOL_SEEN_CRLF)
5406     {
5407       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5408       eol_type = Qdos;
5409     }
5410   else if (eol_seen & EOL_SEEN_CR)
5411     {
5412       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5413       eol_type = Qmac;
5414     }
5415   return eol_type;
5416 }
5417
5418 /* Detect how a text specified in CODING is encoded.  If a coding
5419    system is detected, update fields of CODING by the detected coding
5420    system.  */
5421
5422 void
5423 detect_coding (coding)
5424      struct coding_system *coding;
5425 {
5426   const unsigned char *src, *src_end;
5427
5428   coding->consumed = coding->consumed_char = 0;
5429   coding->produced = coding->produced_char = 0;
5430   coding_set_source (coding);
5431
5432   src_end = coding->source + coding->src_bytes;
5433
5434   /* If we have not yet decided the text encoding type, detect it
5435      now.  */
5436   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5437     {
5438       int c, i;
5439       struct coding_detection_info detect_info;
5440
5441       detect_info.checked = detect_info.found = detect_info.rejected = 0;
5442       for (i = 0, src = coding->source; src < src_end; i++, src++)
5443         {
5444           c = *src;
5445           if (c & 0x80)
5446             break;
5447           if (c < 0x20
5448               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5449               && ! inhibit_iso_escape_detection
5450               && ! detect_info.checked)
5451             {
5452               coding->head_ascii = src - (coding->source + coding->consumed);
5453               if (detect_coding_iso_2022 (coding, &detect_info))
5454                 {
5455                   /* We have scanned the whole data.  */
5456                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5457                     /* We didn't find an 8-bit code.  */
5458                     src = src_end;
5459                   break;
5460                 }
5461             }
5462         }
5463       coding->head_ascii = src - (coding->source + coding->consumed);
5464
5465       if (coding->head_ascii < coding->src_bytes
5466           || detect_info.found)
5467         {
5468           enum coding_category category;
5469           struct coding_system *this;
5470
5471           if (coding->head_ascii == coding->src_bytes)
5472             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
5473             for (i = 0; i < coding_category_raw_text; i++)
5474               {
5475                 category = coding_priorities[i];
5476                 this = coding_categories + category;
5477                 if (detect_info.found & (1 << category))
5478                   break;
5479               }
5480           else
5481             for (i = 0; i < coding_category_raw_text; i++)
5482               {
5483                 category = coding_priorities[i];
5484                 this = coding_categories + category;
5485                 if (this->id < 0)
5486                   {
5487                     /* No coding system of this category is defined.  */
5488                     detect_info.rejected |= (1 << category);
5489                   }
5490                 else if (category >= coding_category_raw_text)
5491                   continue;
5492                 else if (detect_info.checked & (1 << category))
5493                   {
5494                     if (detect_info.found & (1 << category))
5495                       break;
5496                   }
5497                 else if ((*(this->detector)) (coding, &detect_info)
5498                          && detect_info.found & (1 << category))
5499                   {
5500                     if (category == coding_category_utf_16_auto)
5501                       {
5502                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5503                           category = coding_category_utf_16_le;
5504                         else
5505                           category = coding_category_utf_16_be;
5506                       }
5507                     break;
5508                   }
5509               }
5510
5511           if (i < coding_category_raw_text)
5512             setup_coding_system (CODING_ID_NAME (this->id), coding);
5513           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5514             setup_coding_system (Qraw_text, coding);
5515           else if (detect_info.rejected)
5516             for (i = 0; i < coding_category_raw_text; i++)
5517               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5518                 {
5519                   this = coding_categories + coding_priorities[i];
5520                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5521                   break;
5522                 }
5523         }
5524     }
5525   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5526            == coding_category_utf_16_auto)
5527     {
5528       Lisp_Object coding_systems;
5529       struct coding_detection_info detect_info;
5530
5531       coding_systems
5532         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5533       detect_info.found = detect_info.rejected = 0;
5534       if (CONSP (coding_systems)
5535           && detect_coding_utf_16 (coding, &detect_info))
5536         {
5537           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5538             setup_coding_system (XCAR (coding_systems), coding);
5539           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5540             setup_coding_system (XCDR (coding_systems), coding);
5541         }
5542     }
5543 }
5544
5545
5546 static void
5547 decode_eol (coding)
5548      struct coding_system *coding;
5549 {
5550   Lisp_Object eol_type;
5551   unsigned char *p, *pbeg, *pend;
5552
5553   eol_type = CODING_ID_EOL_TYPE (coding->id);
5554   if (EQ (eol_type, Qunix))
5555     return;
5556
5557   if (NILP (coding->dst_object))
5558     pbeg = coding->destination;
5559   else
5560     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5561   pend = pbeg + coding->produced;
5562
5563   if (VECTORP (eol_type))
5564     {
5565       int eol_seen = EOL_SEEN_NONE;
5566
5567       for (p = pbeg; p < pend; p++)
5568         {
5569           if (*p == '\n')
5570             eol_seen |= EOL_SEEN_LF;
5571           else if (*p == '\r')
5572             {
5573               if (p + 1 < pend && *(p + 1) == '\n')
5574                 {
5575                   eol_seen |= EOL_SEEN_CRLF;
5576                   p++;
5577                 }
5578               else
5579                 eol_seen |= EOL_SEEN_CR;
5580             }
5581         }
5582       if (eol_seen != EOL_SEEN_NONE
5583           && eol_seen != EOL_SEEN_LF
5584           && eol_seen != EOL_SEEN_CRLF
5585           && eol_seen != EOL_SEEN_CR)
5586         eol_seen = EOL_SEEN_LF;
5587       if (eol_seen != EOL_SEEN_NONE)
5588         eol_type = adjust_coding_eol_type (coding, eol_seen);
5589     }
5590
5591   if (EQ (eol_type, Qmac))
5592     {
5593       for (p = pbeg; p < pend; p++)
5594         if (*p == '\r')
5595           *p = '\n';
5596     }
5597   else if (EQ (eol_type, Qdos))
5598     {
5599       int n = 0;
5600
5601       if (NILP (coding->dst_object))
5602         {
5603           for (p = pend - 2; p >= pbeg; p--)
5604             if (*p == '\r')
5605               {
5606                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5607                 n++;
5608               }
5609         }
5610       else
5611         {
5612           for (p = pend - 2; p >= pbeg; p--)
5613             if (*p == '\r')
5614               {
5615                 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5616                 int pos = BYTE_TO_CHAR (pos_byte);
5617
5618                 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5619                 n++;
5620               }
5621         }
5622       coding->produced -= n;
5623       coding->produced_char -= n;
5624     }
5625 }
5626
5627
5628 /* Return a translation table (or list of them) from coding system
5629    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5630    decoding (ENCODEP is zero). */
5631
5632 static Lisp_Object
5633 get_translation_table (attrs, encodep, max_lookup)
5634      Lisp_Object attrs;
5635      int encodep, *max_lookup;
5636 {
5637   Lisp_Object standard, translation_table;
5638   Lisp_Object val;
5639
5640   if (encodep)
5641     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5642       standard = Vstandard_translation_table_for_encode;
5643   else
5644     translation_table = CODING_ATTR_DECODE_TBL (attrs),
5645       standard = Vstandard_translation_table_for_decode;
5646   if (NILP (translation_table))
5647     translation_table = standard;
5648   else
5649     {
5650       if (SYMBOLP (translation_table))
5651         translation_table = Fget (translation_table, Qtranslation_table);
5652       else if (CONSP (translation_table))
5653         {
5654           translation_table = Fcopy_sequence (translation_table);
5655           for (val = translation_table; CONSP (val); val = XCDR (val))
5656             if (SYMBOLP (XCAR (val)))
5657               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5658         }
5659       if (CHAR_TABLE_P (standard))
5660         {
5661           if (CONSP (translation_table))
5662             translation_table = nconc2 (translation_table,
5663                                         Fcons (standard, Qnil));
5664           else
5665             translation_table = Fcons (translation_table,
5666                                        Fcons (standard, Qnil));
5667         }
5668     }
5669
5670   if (max_lookup)
5671     {
5672       *max_lookup = 1;
5673       if (CHAR_TABLE_P (translation_table)
5674           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5675         {
5676           val = XCHAR_TABLE (translation_table)->extras[1];
5677           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5678             *max_lookup = XFASTINT (val);
5679         }
5680       else if (CONSP (translation_table))
5681         {
5682           Lisp_Object tail, val;
5683
5684           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5685             if (CHAR_TABLE_P (XCAR (tail))
5686                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5687               {
5688                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5689                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5690                   *max_lookup = XFASTINT (val);
5691               }
5692         }
5693     }
5694   return translation_table;
5695 }
5696
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables; anything else leaves C alone and sets TRANS to Qnil.

   TRANS is first set to Qnil.  For each char-table consulted, if the
   entry for C is a character, C is replaced by it and TRANS is reset
   to Qnil.  For a single char-table any other entry is left in
   TRANS.  For a list, the tables are consulted in order with the
   current C; the first entry that is neither nil nor a character is
   left in TRANS and stops the search.  Elements of the list that are
   not char-tables are skipped.

   C and TRANS must be lvalues; both, as well as TABLE, are evaluated
   more than once.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
  do {                                                          \
    trans = Qnil;                                               \
    if (CHAR_TABLE_P (table))                                   \
      {                                                         \
        trans = CHAR_TABLE_REF (table, c);                      \
        if (CHARACTERP (trans))                                 \
          c = XFASTINT (trans), trans = Qnil;                   \
      }                                                         \
    else if (CONSP (table))                                     \
      {                                                         \
        Lisp_Object tail;                                       \
                                                                \
        for (tail = table; CONSP (tail); tail = XCDR (tail))    \
          if (CHAR_TABLE_P (XCAR (tail)))                       \
            {                                                   \
              trans = CHAR_TABLE_REF (XCAR (tail), c);          \
              if (CHARACTERP (trans))                           \
                c = XFASTINT (trans), trans = Qnil;             \
              else if (! NILP (trans))                          \
                break;                                          \
            }                                                   \
      }                                                         \
  } while (0)
5721
5722
5723 static Lisp_Object
5724 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5725      Lisp_Object val;
5726      int *buf, *buf_end;
5727      int last_block;
5728      int *from_nchars, *to_nchars;
5729 {
5730   /* VAL is TO or (([FROM-CHAR ...] .  TO) ...) where TO is TO-CHAR or
5731      [TO-CHAR ...].  */
5732   if (CONSP (val))
5733     {
5734       Lisp_Object from, tail;
5735       int i, len;
5736
5737       for (tail = val; CONSP (tail); tail = XCDR (tail))
5738         {
5739           val = XCAR (tail);
5740           from = XCAR (val);
5741           len = ASIZE (from);
5742           for (i = 0; i < len; i++)
5743             {
5744               if (buf + i == buf_end)
5745                 {
5746                   if (! last_block)
5747                     return Qt;
5748                   break;
5749                 }
5750               if (XINT (AREF (from, i)) != buf[i])
5751                 break;
5752             }
5753           if (i == len)
5754             {
5755               val = XCDR (val);
5756               *from_nchars = len;
5757               break;
5758             }
5759         }
5760       if (! CONSP (tail))
5761         return Qnil;
5762     }
5763   if (VECTORP (val))
5764     *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5765   else
5766     *buf = XINT (val);
5767   return val;
5768 }
5769
5770
5771 static int
5772 produce_chars (coding, translation_table, last_block)
5773      struct coding_system *coding;
5774      Lisp_Object translation_table;
5775      int last_block;
5776 {
5777   unsigned char *dst = coding->destination + coding->produced;
5778   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5779   int produced;
5780   int produced_chars = 0;
5781   int carryover = 0;
5782
5783   if (! coding->chars_at_source)
5784     {
5785       /* Characters are in coding->charbuf.  */
5786       int *buf = coding->charbuf;
5787       int *buf_end = buf + coding->charbuf_used;
5788
5789       if (BUFFERP (coding->src_object)
5790           && EQ (coding->src_object, coding->dst_object))
5791         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5792
5793       while (buf < buf_end)
5794         {
5795           int c = *buf, i;
5796
5797           if (c >= 0)
5798             {
5799               int from_nchars = 1, to_nchars = 1;
5800               Lisp_Object trans = Qnil;
5801
5802               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5803               if (! NILP (trans))
5804                 {
5805                   trans = get_translation (trans, buf, buf_end, last_block,
5806                                            &from_nchars, &to_nchars);
5807                   if (EQ (trans, Qt))
5808                     break;
5809                   c = *buf;
5810                 }
5811
5812               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5813                 {
5814                   dst = alloc_destination (coding,
5815                                            buf_end - buf
5816                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
5817                                            dst);
5818                   dst_end = coding->destination + coding->dst_bytes;
5819                 }
5820
5821               for (i = 0; i < to_nchars; i++)
5822                 {
5823                   if (i > 0)
5824                     c = XINT (AREF (trans, i));
5825                   if (coding->dst_multibyte
5826                       || ! CHAR_BYTE8_P (c))
5827                     CHAR_STRING_ADVANCE (c, dst);
5828                   else
5829                     *dst++ = CHAR_TO_BYTE8 (c);
5830                 }
5831               produced_chars += to_nchars;
5832               *buf++ = to_nchars;
5833               while (--from_nchars > 0)
5834                 *buf++ = 0;
5835             }
5836           else
5837             /* This is an annotation datum.  (-C) is the length.  */
5838             buf += -c;
5839         }
5840       carryover = buf_end - buf;
5841     }
5842   else
5843     {
5844       const unsigned char *src = coding->source;
5845       const unsigned char *src_end = src + coding->src_bytes;
5846       Lisp_Object eol_type;
5847
5848       eol_type = CODING_ID_EOL_TYPE (coding->id);
5849
5850       if (coding->src_multibyte != coding->dst_multibyte)
5851         {
5852           if (coding->src_multibyte)
5853             {
5854               int multibytep = 1;
5855               int consumed_chars;
5856
5857               while (1)
5858                 {
5859                   const unsigned char *src_base = src;
5860                   int c;
5861
5862                   ONE_MORE_BYTE (c);
5863                   if (c == '\r')
5864                     {
5865                       if (EQ (eol_type, Qdos))
5866                         {
5867                           if (src == src_end)
5868                             {
5869                               record_conversion_result
5870                                 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5871                               goto no_more_source;
5872                             }
5873                           if (*src == '\n')
5874                             c = *src++;
5875                         }
5876                       else if (EQ (eol_type, Qmac))
5877                         c = '\n';
5878                     }
5879                   if (dst == dst_end)
5880                     {
5881                       coding->consumed = src - coding->source;
5882
5883                     if (EQ (coding->src_object, coding->dst_object))
5884                       dst_end = (unsigned char *) src;
5885                     if (dst == dst_end)
5886                       {
5887                         dst = alloc_destination (coding, src_end - src + 1,
5888                                                  dst);
5889                         dst_end = coding->destination + coding->dst_bytes;
5890                         coding_set_source (coding);
5891                         src = coding->source + coding->consumed;
5892                         src_end = coding->source + coding->src_bytes;
5893                       }
5894                     }
5895                   *dst++ = c;
5896                   produced_chars++;
5897                 }
5898             no_more_source:
5899               ;
5900             }
5901           else
5902             while (src < src_end)
5903               {
5904                 int multibytep = 1;
5905                 int c = *src++;
5906
5907                 if (c == '\r')
5908                   {
5909                     if (EQ (eol_type, Qdos))
5910                       {
5911                         if (src < src_end
5912                             && *src == '\n')
5913                           c = *src++;
5914                       }
5915                     else if (EQ (eol_type, Qmac))
5916                       c = '\n';
5917                   }
5918                 if (dst >= dst_end - 1)
5919                   {
5920                     coding->consumed = src - coding->source;
5921
5922                     if (EQ (coding->src_object, coding->dst_object))
5923                       dst_end = (unsigned char *) src;
5924                     if (dst >= dst_end - 1)
5925                       {
5926                         dst = alloc_destination (coding, src_end - src + 2,
5927                                                  dst);
5928                         dst_end = coding->destination + coding->dst_bytes;
5929                         coding_set_source (coding);
5930                         src = coding->source + coding->consumed;
5931                         src_end = coding->source + coding->src_bytes;
5932                       }
5933                   }
5934                 EMIT_ONE_BYTE (c);
5935               }
5936         }
5937       else
5938         {
5939           if (!EQ (coding->src_object, coding->dst_object))
5940             {
5941               int require = coding->src_bytes - coding->dst_bytes;
5942
5943               if (require > 0)
5944                 {
5945                   EMACS_INT offset = src - coding->source;
5946
5947                   dst = alloc_destination (coding, require, dst);
5948                   coding_set_source (coding);
5949                   src = coding->source + offset;
5950                   src_end = coding->source + coding->src_bytes;
5951                 }
5952             }
5953           produced_chars = coding->src_chars;
5954           while (src < src_end)
5955             {
5956               int c = *src++;
5957
5958               if (c == '\r')
5959                 {
5960                   if (EQ (eol_type, Qdos))
5961                     {
5962                       if (src < src_end
5963                           && *src == '\n')
5964                         c = *src++;
5965                       produced_chars--;
5966                     }
5967                   else if (EQ (eol_type, Qmac))
5968                     c = '\n';
5969                 }
5970               *dst++ = c;
5971             }
5972         }
5973       coding->consumed = coding->src_bytes;
5974       coding->consumed_char = coding->src_chars;
5975     }
5976
5977   produced = dst - (coding->destination + coding->produced);
5978   if (BUFFERP (coding->dst_object))
5979     insert_from_gap (produced_chars, produced);
5980   coding->produced += produced;
5981   coding->produced_char += produced_chars;
5982   return carryover;
5983 }
5984
5985 /* Compose text in CODING->object according to the annotation data at
5986    CHARBUF.  CHARBUF is an array:
5987      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5988  */
5989
5990 static INLINE void
5991 produce_composition (coding, charbuf, pos)
5992      struct coding_system *coding;
5993      int *charbuf;
5994      EMACS_INT pos;
5995 {
5996   int len;
5997   EMACS_INT to;
5998   enum composition_method method;
5999   Lisp_Object components;
6000
6001   len = -charbuf[0];
6002   to = pos + charbuf[2];
6003   if (to <= pos)
6004     return;
6005   method = (enum composition_method) (charbuf[3]);
6006
6007   if (method == COMPOSITION_RELATIVE)
6008     components = Qnil;
6009   else if (method >= COMPOSITION_WITH_RULE
6010            && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6011     {
6012       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6013       int i;
6014
6015       len -= 4;
6016       charbuf += 4;
6017       for (i = 0; i < len; i++)
6018         {
6019           args[i] = make_number (charbuf[i]);
6020           if (args[i] < 0)
6021             return;
6022         }
6023       components = (method == COMPOSITION_WITH_ALTCHARS
6024                     ? Fstring (len, args) : Fvector (len, args));
6025     }
6026   else
6027     return;
6028   compose_text (pos, to, components, Qnil, coding->dst_object);
6029 }
6030
6031
6032 /* Put `charset' property on text in CODING->object according to
6033    the annotation data at CHARBUF.  CHARBUF is an array:
6034      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6035  */
6036
6037 static INLINE void
6038 produce_charset (coding, charbuf, pos)
6039      struct coding_system *coding;
6040      int *charbuf;
6041      EMACS_INT pos;
6042 {
6043   EMACS_INT from = pos - charbuf[2];
6044   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6045
6046   Fput_text_property (make_number (from), make_number (pos),
6047                       Qcharset, CHARSET_NAME (charset),
6048                       coding->dst_object);
6049 }
6050
6051
/* Number of `int' elements first requested for the character work
   area (CODING->charbuf).  */
#define CHARBUF_SIZE 0x4000

/* Allocate the character work area of CODING with alloca and record
   its size (in elements) in CODING->charbuf_size.

   CHARBUF_SIZE elements are requested first; as long as alloca
   yields a null pointer the request is halved, giving up once the
   size is no longer above 1024.  On failure the conversion result
   CODING_RESULT_INSUFFICIENT_MEM is recorded and the macro executes
   `return coding->result;' -- i.e. it returns from the function that
   uses the macro, which therefore must have a compatible return
   type.

   Since the memory comes from alloca, it belongs to the stack frame
   of the function that expands this macro and must not be used after
   that function returns.  */
#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size = CHARBUF_SIZE;;                                           \
                                                                        \
    coding->charbuf = NULL;                                             \
    while (size > 1024)                                                 \
      {                                                                 \
        coding->charbuf = (int *) alloca (sizeof (int) * size);         \
        if (coding->charbuf)                                            \
          break;                                                        \
        size >>= 1;                                                     \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result;                                          \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
6073
6074
6075 static void
6076 produce_annotation (coding, pos)
6077      struct coding_system *coding;
6078      EMACS_INT pos;
6079 {
6080   int *charbuf = coding->charbuf;
6081   int *charbuf_end = charbuf + coding->charbuf_used;
6082
6083   if (NILP (coding->dst_object))
6084     return;
6085
6086   while (charbuf < charbuf_end)
6087     {
6088       if (*charbuf >= 0)
6089         pos += *charbuf++;
6090       else
6091         {
6092           int len = -*charbuf;
6093           switch (charbuf[1])
6094             {
6095             case CODING_ANNOTATE_COMPOSITION_MASK:
6096               produce_composition (coding, charbuf, pos);
6097               break;
6098             case CODING_ANNOTATE_CHARSET_MASK:
6099               produce_charset (coding, charbuf, pos);
6100               break;
6101             default:
6102               abort ();
6103             }
6104           charbuf += len;
6105         }
6106     }
6107 }
6108
/* Decode the data at CODING->src_object into CODING->dst_object.
   CODING->src_object is a buffer, a string, or nil.
   CODING->dst_object is a buffer.

   If CODING->src_object is a buffer, it must be the current buffer.
   In this case, if CODING->src_pos is positive, it is a position of
   the source text in the buffer, otherwise, the source text is in the
   gap area of the buffer, and CODING->src_pos specifies the offset of
   the text from GPT (which must be the same as PT).  If this is the
   same buffer as CODING->dst_object, CODING->src_pos must be
   negative.

   If CODING->src_object is a string, CODING->src_pos is an index to
   that string.

   If CODING->src_object is nil, CODING->source must already point to
   the non-relocatable memory area.  In this case, CODING->src_pos is
   an offset from CODING->source.

   The decoded data is inserted at the current point of the buffer
   CODING->dst_object.

   Return CODING->result.  */

static int
decode_coding (coding)
     struct coding_system *coding;
{
  Lisp_Object attrs;
  Lisp_Object undo_list;
  Lisp_Object translation_table;
  int carryover;
  int i;

  /* If the gap lies inside the source text, move it to the start of
     the source so that the text is not split by the gap.  */
  if (BUFFERP (coding->src_object)
      && coding->src_pos > 0
      && coding->src_pos < GPT
      && coding->src_pos + coding->src_chars > GPT)
    move_gap_both (coding->src_pos, coding->src_pos_byte);

  undo_list = Qt;
  if (BUFFERP (coding->dst_object))
    {
      if (current_buffer != XBUFFER (coding->dst_object))
        set_buffer_internal (XBUFFER (coding->dst_object));
      if (GPT != PT)
        move_gap_both (PT, PT_BYTE);
      /* Disable undo recording while decoding; the saved list is put
         back, and one insertion recorded, near the end of this
         function.  */
      undo_list = current_buffer->undo_list;
      current_buffer->undo_list = Qt;
    }

  coding->consumed = coding->consumed_char = 0;
  coding->produced = coding->produced_char = 0;
  coding->chars_at_source = 0;
  record_conversion_result (coding, CODING_RESULT_SUCCESS);
  coding->errors = 0;

  /* NOTE(review): on allocation failure this macro returns from this
     function right here, i.e. without restoring the undo list that
     was replaced above.  */
  ALLOC_CONVERSION_WORK_AREA (coding);

  attrs = CODING_ID_ATTRS (coding->id);
  translation_table = get_translation_table (attrs, 0, NULL);

  carryover = 0;
  do
    {
      /* Destination position that corresponds to the start of the
         charbuf filled in this round; used for annotations.  */
      EMACS_INT pos = coding->dst_pos + coding->produced_char;

      coding_set_source (coding);
      coding->annotated = 0;
      /* Characters left untranslated by the previous round sit at
         the head of the charbuf.  */
      coding->charbuf_used = carryover;
      (*(coding->decoder)) (coding);
      coding_set_destination (coding);
      carryover = produce_chars (coding, translation_table, 0);
      if (coding->annotated)
        produce_annotation (coding, pos);
      /* Move the unprocessed tail of the charbuf to its head.  */
      for (i = 0; i < carryover; i++)
        coding->charbuf[i]
          = coding->charbuf[coding->charbuf_used - carryover + i];
    }
  while (coding->consumed < coding->src_bytes
         && (coding->result == CODING_RESULT_SUCCESS
             || coding->result == CODING_RESULT_INVALID_SRC));

  if (carryover > 0)
    {
      /* No more input will come for these characters; produce them
         with LAST_BLOCK set.  */
      coding_set_destination (coding);
      coding->charbuf_used = carryover;
      produce_chars (coding, translation_table, 1);
    }

  coding->carryover_bytes = 0;
  if (coding->consumed < coding->src_bytes)
    {
      /* Some source bytes were left undecoded.  */
      int nbytes = coding->src_bytes - coding->consumed;
      const unsigned char *src;

      coding_set_source (coding);
      coding_set_destination (coding);
      src = coding->source + coding->consumed;

      if (coding->mode & CODING_MODE_LAST_BLOCK)
        {
          /* Flush out unprocessed data as binary chars.  We are sure
             that the number of data is less than the size of
             coding->charbuf.  */
          coding->charbuf_used = 0;
          while (nbytes-- > 0)
            {
              int c = *src++;

              if (c & 0x80)
                c = BYTE8_TO_CHAR (c);
              coding->charbuf[coding->charbuf_used++] = c;
            }
          /* No translation table is applied to these raw bytes.  */
          produce_chars (coding, Qnil, 1);
        }
      else
        {
          /* Record unprocessed bytes in coding->carryover.  We are
             sure that the number of data is less than the size of
             coding->carryover.  */
          unsigned char *p = coding->carryover;

          coding->carryover_bytes = nbytes;
          while (nbytes-- > 0)
            *p++ = *src++;
        }
      coding->consumed = coding->src_bytes;
    }

  if (BUFFERP (coding->dst_object))
    {
      /* Re-enable undo and record the whole decoded text as a single
         insertion.  */
      current_buffer->undo_list = undo_list;
      record_insert (coding->dst_pos, coding->produced_char);
    }
  if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
    decode_eol (coding);
  return coding->result;
}
6247
6248
/* Extract an annotation datum from a composition starting at POS and
   ending before LIMIT of CODING->src_object (buffer or string), store
   the data in BUF, set *STOP to a starting position of the next
   composition (if any) or to LIMIT, and return the address of the
   next element of BUF.

   If such an annotation is not found, set *STOP to a starting
   position of a composition after POS (if any) or to LIMIT, and
   return BUF.  */

static INLINE int *
handle_composition_annotation (pos, limit, coding, buf, stop)
     EMACS_INT pos, limit;
     struct coding_system *coding;
     int *buf;
     EMACS_INT *stop;
{
  EMACS_INT start, end;
  Lisp_Object prop;

  /* No composition at or after POS, or the one found extends beyond
     LIMIT: nothing to annotate before LIMIT.  */
  if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
      || end > limit)
    *stop = limit;
  /* The nearest composition begins later; stop there next time.  */
  else if (start > pos)
    *stop = start;
  else
    {
      /* Here START <= POS.  Data is emitted only for a composition
         that begins exactly at POS.  */
      if (start == pos)
        {
          /* We found a composition.  Store the corresponding
             annotation data in BUF.  */
          int *head = buf;
          enum composition_method method = COMPOSITION_METHOD (prop);
          int nchars = COMPOSITION_LENGTH (prop);

          /* Presumably stores the annotation header (starting with
             its negative length at *HEAD) and advances BUF -- confirm
             against the macro definition.  */
          ADD_COMPOSITION_DATA (buf, nchars, method);
          if (method != COMPOSITION_RELATIVE)
            {
              /* Append the components, whatever form the property
                 keeps them in, counting them in LEN.  */
              Lisp_Object components;
              int len, i, i_byte;

              components = COMPOSITION_COMPONENTS (prop);
              if (VECTORP (components))
                {
                  len = XVECTOR (components)->size;
                  for (i = 0; i < len; i++)
                    *buf++ = XINT (AREF (components, i));
                }
              else if (STRINGP (components))
                {
                  len = SCHARS (components);
                  i = i_byte = 0;
                  while (i < len)
                    {
                      FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
                      buf++;
                    }
                }
              else if (INTEGERP (components))
                {
                  len = 1;
                  *buf++ = XINT (components);
                }
              else if (CONSP (components))
                {
                  for (len = 0; CONSP (components);
                       len++, components = XCDR (components))
                    *buf++ = XINT (XCAR (components));
                }
              else
                abort ();
              /* Account for the LEN components in the value at the
                 head of the annotation by making it more negative.  */
              *head -= len;
            }
        }

      /* Look for the next composition after the end of this one.  */
      if (find_composition (end, limit, &start, &end, &prop,
                            coding->src_object)
          && end <= limit)
        *stop = start;
      else
        *stop = limit;
    }
  return buf;
}
6333
6334
6335 /* Extract an annotation datum from a text property `charset' at POS of
6336    CODING->src_object (buffer of string), store the data in BUF, set
6337    *STOP to the position where the value of `charset' property changes
6338    (limiting by LIMIT), and return the address of the next element of
6339    BUF.
6340
6341    If the property value is nil, set *STOP to the position where the
6342    property value is non-nil (limiting by LIMIT), and return BUF.  */
6343
6344 static INLINE int *
6345 handle_charset_annotation (pos, limit, coding, buf, stop)
6346      EMACS_INT pos, limit;
6347      struct coding_system *coding;
6348      int *buf;
6349      EMACS_INT *stop;
6350 {
6351   Lisp_Object val, next;
6352   int id;
6353
6354   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6355   if (! NILP (val) && CHARSETP (val))
6356     id = XINT (CHARSET_SYMBOL_ID (val));
6357   else
6358     id = -1;
6359   ADD_CHARSET_DATA (buf, 0, id);
6360   next = Fnext_single_property_change (make_number (pos), Qcharset,
6361                                        coding->src_object,
6362                                        make_number (limit));
6363   *stop = XINT (next);
6364   return buf;
6365 }
6366
6367
6368 static void
6369 consume_chars (coding, translation_table, max_lookup)
6370      struct coding_system *coding;
6371      Lisp_Object translation_table;
6372      int max_lookup;
6373 {
6374   int *buf = coding->charbuf;
6375   int *buf_end = coding->charbuf + coding->charbuf_size;
6376   const unsigned char *src = coding->source + coding->consumed;
6377   const unsigned char *src_end = coding->source + coding->src_bytes;
6378   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6379   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6380   int multibytep = coding->src_multibyte;
6381   Lisp_Object eol_type;
6382   int c;
6383   EMACS_INT stop, stop_composition, stop_charset;
6384   int *lookup_buf = NULL;
6385
6386   if (! NILP (translation_table))
6387     lookup_buf = alloca (sizeof (int) * max_lookup);
6388
6389   eol_type = CODING_ID_EOL_TYPE (coding->id);
6390   if (VECTORP (eol_type))
6391     eol_type = Qunix;
6392
6393   /* Note: composition handling is not yet implemented.  */
6394   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6395
6396   if (NILP (coding->src_object))
6397     stop = stop_composition = stop_charset = end_pos;
6398   else
6399     {
6400       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6401         stop = stop_composition = pos;
6402       else
6403         stop = stop_composition = end_pos;
6404       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6405         stop = stop_charset = pos;
6406       else
6407         stop_charset = end_pos;
6408     }
6409
6410   /* Compensate for CRLF and conversion.  */
6411   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6412   while (buf < buf_end)
6413     {
6414       Lisp_Object trans;
6415
6416       if (pos == stop)
6417         {
6418           if (pos == end_pos)
6419             break;
6420           if (pos == stop_composition)
6421             buf = handle_composition_annotation (pos, end_pos, coding,
6422                                                  buf, &stop_composition);
6423           if (pos == stop_charset)
6424             buf = handle_charset_annotation (pos, end_pos, coding,
6425                                              buf, &stop_charset);
6426           stop = (stop_composition < stop_charset
6427                   ? stop_composition : stop_charset);
6428         }
6429
6430       if (! multibytep)
6431         {
6432           EMACS_INT bytes;
6433
6434           if (coding->encoder == encode_coding_raw_text)
6435             c = *src++, pos++;
6436           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6437             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6438           else
6439             c = BYTE8_TO_CHAR (*src), src++, pos++;
6440         }
6441       else
6442         c = STRING_CHAR_ADVANCE (src), pos++;
6443       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6444         c = '\n';
6445       if (! EQ (eol_type, Qunix))
6446         {
6447           if (c == '\n')
6448             {
6449               if (EQ (eol_type, Qdos))
6450                 *buf++ = '\r';
6451               else
6452                 c = '\r';
6453             }
6454         }
6455
6456       trans = Qnil;
6457       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6458       if (NILP (trans))
6459         *buf++ = c;
6460       else
6461         {
6462           int from_nchars = 1, to_nchars = 1;
6463           int *lookup_buf_end;
6464           const unsigned char *p = src;
6465           int i;
6466
6467           lookup_buf[0] = c;
6468           for (i = 1; i < max_lookup && p < src_end; i++)
6469             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6470           lookup_buf_end = lookup_buf + i;
6471           trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6472                                    &from_nchars, &to_nchars);
6473           if (EQ (trans, Qt)
6474               || buf + to_nchars > buf_end)
6475             break;
6476           *buf++ = *lookup_buf;
6477           for (i = 1; i < to_nchars; i++)
6478             *buf++ = XINT (AREF (trans, i));
6479           for (i = 1; i < from_nchars; i++, pos++)
6480             src += MULTIBYTE_LENGTH_NO_CHECK (src);
6481         }
6482     }
6483
6484   coding->consumed = src - coding->source;
6485   coding->consumed_char = pos - coding->src_pos;
6486   coding->charbuf_used = buf - coding->charbuf;
6487   coding->chars_at_source = 0;
6488 }
6489
6490
6491 /* Encode the text at CODING->src_object into CODING->dst_object.
6492    CODING->src_object is a buffer or a string.
6493    CODING->dst_object is a buffer or nil.
6494
6495    If CODING->src_object is a buffer, it must be the current buffer.
6496    In this case, if CODING->src_pos is positive, it is a position of
6497    the source text in the buffer, otherwise. the source text is in the
6498    gap area of the buffer, and coding->src_pos specifies the offset of
6499    the text from GPT (which must be the same as PT).  If this is the
6500    same buffer as CODING->dst_object, CODING->src_pos must be
6501    negative and CODING should not have `pre-write-conversion'.
6502
6503    If CODING->src_object is a string, CODING should not have
6504    `pre-write-conversion'.
6505
6506    If CODING->dst_object is a buffer, the encoded data is inserted at
6507    the current point of that buffer.
6508
6509    If CODING->dst_object is nil, the encoded data is placed at the
6510    memory area specified by CODING->destination.  */
6511
6512 static int
6513 encode_coding (coding)
6514      struct coding_system *coding;
6515 {
6516   Lisp_Object attrs;
6517   Lisp_Object translation_table;
6518   int max_lookup;
6519
6520   attrs = CODING_ID_ATTRS (coding->id);
6521   if (coding->encoder == encode_coding_raw_text)
6522     translation_table = Qnil, max_lookup = 0;
6523   else
6524     translation_table = get_translation_table (attrs, 1, &max_lookup);
6525
6526   if (BUFFERP (coding->dst_object))
6527     {
6528       set_buffer_internal (XBUFFER (coding->dst_object));
6529       coding->dst_multibyte
6530         = ! NILP (current_buffer->enable_multibyte_characters);
6531     }
6532
6533   coding->consumed = coding->consumed_char = 0;
6534   coding->produced = coding->produced_char = 0;
6535   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6536   coding->errors = 0;
6537
6538   ALLOC_CONVERSION_WORK_AREA (coding);
6539
6540   do {
6541     coding_set_source (coding);
6542     consume_chars (coding, translation_table, max_lookup);
6543     coding_set_destination (coding);
6544     (*(coding->encoder)) (coding);
6545   } while (coding->consumed_char < coding->src_chars);
6546
6547   if (BUFFERP (coding->dst_object))
6548     insert_from_gap (coding->produced_char, coding->produced);
6549
6550   return (coding->result);
6551 }
6552
6553
6554 /* Name (or base name) of work buffer for code conversion.  */
6555 static Lisp_Object Vcode_conversion_workbuf_name;
6556
6557 /* A working buffer used by the top level conversion.  Once it is
6558    created, it is never destroyed.  It has the name
6559    Vcode_conversion_workbuf_name.  The other working buffers are
6560    destroyed after the use is finished, and their names are modified
6561    versions of Vcode_conversion_workbuf_name.  */
6562 static Lisp_Object Vcode_conversion_reused_workbuf;
6563
6564 /* 1 iff Vcode_conversion_reused_workbuf is already in use.  */
6565 static int reused_workbuf_in_use;
6566
6567
6568 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
6569    multibyteness of returning buffer.  */
6570
6571 static Lisp_Object
6572 make_conversion_work_buffer (multibyte)
6573      int multibyte;
6574 {
6575   Lisp_Object name, workbuf;
6576   struct buffer *current;
6577
6578   if (reused_workbuf_in_use++)
6579     {
6580       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6581       workbuf = Fget_buffer_create (name);
6582     }
6583   else
6584     {
6585       name = Vcode_conversion_workbuf_name;
6586       workbuf = Fget_buffer_create (name);
6587       if (NILP (Vcode_conversion_reused_workbuf))
6588         Vcode_conversion_reused_workbuf = workbuf;
6589     }
6590   current = current_buffer;
6591   set_buffer_internal (XBUFFER (workbuf));
6592   Ferase_buffer ();
6593   current_buffer->undo_list = Qt;
6594   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6595   set_buffer_internal (current);
6596   return workbuf;
6597 }
6598
6599
6600 static Lisp_Object
6601 code_conversion_restore (arg)
6602      Lisp_Object arg;
6603 {
6604   Lisp_Object current, workbuf;
6605   struct gcpro gcpro1;
6606
6607   GCPRO1 (arg);
6608   current = XCAR (arg);
6609   workbuf = XCDR (arg);
6610   if (! NILP (workbuf))
6611     {
6612       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6613         reused_workbuf_in_use = 0;
6614       else if (! NILP (Fbuffer_live_p (workbuf)))
6615         Fkill_buffer (workbuf);
6616     }
6617   set_buffer_internal (XBUFFER (current));
6618   UNGCPRO;
6619   return Qnil;
6620 }
6621
6622 Lisp_Object
6623 code_conversion_save (with_work_buf, multibyte)
6624      int with_work_buf, multibyte;
6625 {
6626   Lisp_Object workbuf = Qnil;
6627
6628   if (with_work_buf)
6629     workbuf = make_conversion_work_buffer (multibyte);
6630   record_unwind_protect (code_conversion_restore,
6631                          Fcons (Fcurrent_buffer (), workbuf));
6632   return workbuf;
6633 }
6634
6635 int
6636 decode_coding_gap (coding, chars, bytes)
6637      struct coding_system *coding;
6638      EMACS_INT chars, bytes;
6639 {
6640   int count = specpdl_ptr - specpdl;
6641   Lisp_Object attrs;
6642
6643   code_conversion_save (0, 0);
6644
6645   coding->src_object = Fcurrent_buffer ();
6646   coding->src_chars = chars;
6647   coding->src_bytes = bytes;
6648   coding->src_pos = -chars;
6649   coding->src_pos_byte = -bytes;
6650   coding->src_multibyte = chars < bytes;
6651   coding->dst_object = coding->src_object;
6652   coding->dst_pos = PT;
6653   coding->dst_pos_byte = PT_BYTE;
6654   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6655   coding->mode |= CODING_MODE_LAST_BLOCK;
6656
6657   if (CODING_REQUIRE_DETECTION (coding))
6658     detect_coding (coding);
6659
6660   decode_coding (coding);
6661
6662   attrs = CODING_ID_ATTRS (coding->id);
6663   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6664     {
6665       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6666       Lisp_Object val;
6667
6668       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6669       val = call1 (CODING_ATTR_POST_READ (attrs),
6670                    make_number (coding->produced_char));
6671       CHECK_NATNUM (val);
6672       coding->produced_char += Z - prev_Z;
6673       coding->produced += Z_BYTE - prev_Z_BYTE;
6674     }
6675
6676   unbind_to (count, Qnil);
6677   return coding->result;
6678 }
6679
6680 int
6681 encode_coding_gap (coding, chars, bytes)
6682      struct coding_system *coding;
6683      EMACS_INT chars, bytes;
6684 {
6685   int count = specpdl_ptr - specpdl;
6686
6687   code_conversion_save (0, 0);
6688
6689   coding->src_object = Fcurrent_buffer ();
6690   coding->src_chars = chars;
6691   coding->src_bytes = bytes;
6692   coding->src_pos = -chars;
6693   coding->src_pos_byte = -bytes;
6694   coding->src_multibyte = chars < bytes;
6695   coding->dst_object = coding->src_object;
6696   coding->dst_pos = PT;
6697   coding->dst_pos_byte = PT_BYTE;
6698
6699   encode_coding (coding);
6700
6701   unbind_to (count, Qnil);
6702   return coding->result;
6703 }
6704
6705
6706 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6707    SRC_OBJECT into DST_OBJECT by coding context CODING.
6708
6709    SRC_OBJECT is a buffer, a string, or Qnil.
6710
6711    If it is a buffer, the text is at point of the buffer.  FROM and TO
6712    are positions in the buffer.
6713
6714    If it is a string, the text is at the beginning of the string.
6715    FROM and TO are indices to the string.
6716
6717    If it is nil, the text is at coding->source.  FROM and TO are
6718    indices to coding->source.
6719
6720    DST_OBJECT is a buffer, Qt, or Qnil.
6721
6722    If it is a buffer, the decoded text is inserted at point of the
6723    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6724    is deleted.
6725
6726    If it is Qt, a string is made from the decoded text, and
6727    set in CODING->dst_object.
6728
6729    If it is Qnil, the decoded text is stored at CODING->destination.
6730    The caller must allocate CODING->dst_bytes bytes at
6731    CODING->destination by xmalloc.  If the decoded text is longer than
6732    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6733  */
6734
6735 void
6736 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6737                       dst_object)
6738      struct coding_system *coding;
6739      Lisp_Object src_object;
6740      EMACS_INT from, from_byte, to, to_byte;
6741      Lisp_Object dst_object;
6742 {
6743   int count = specpdl_ptr - specpdl;
6744   unsigned char *destination;
6745   EMACS_INT dst_bytes;
6746   EMACS_INT chars = to - from;
6747   EMACS_INT bytes = to_byte - from_byte;
6748   Lisp_Object attrs;
6749   Lisp_Object buffer;
6750   int saved_pt = -1, saved_pt_byte;
6751
6752   buffer = Fcurrent_buffer ();
6753
6754   if (NILP (dst_object))
6755     {
6756       destination = coding->destination;
6757       dst_bytes = coding->dst_bytes;
6758     }
6759
6760   coding->src_object = src_object;
6761   coding->src_chars = chars;
6762   coding->src_bytes = bytes;
6763   coding->src_multibyte = chars < bytes;
6764
6765   if (STRINGP (src_object))
6766     {
6767       coding->src_pos = from;
6768       coding->src_pos_byte = from_byte;
6769     }
6770   else if (BUFFERP (src_object))
6771     {
6772       set_buffer_internal (XBUFFER (src_object));
6773       if (from != GPT)
6774         move_gap_both (from, from_byte);
6775       if (EQ (src_object, dst_object))
6776         {
6777           saved_pt = PT, saved_pt_byte = PT_BYTE;
6778           TEMP_SET_PT_BOTH (from, from_byte);
6779           del_range_both (from, from_byte, to, to_byte, 1);
6780           coding->src_pos = -chars;
6781           coding->src_pos_byte = -bytes;
6782         }
6783       else
6784         {
6785           coding->src_pos = from;
6786           coding->src_pos_byte = from_byte;
6787         }
6788     }
6789
6790   if (CODING_REQUIRE_DETECTION (coding))
6791     detect_coding (coding);
6792   attrs = CODING_ID_ATTRS (coding->id);
6793
6794   if (EQ (dst_object, Qt)
6795       || (! NILP (CODING_ATTR_POST_READ (attrs))
6796           && NILP (dst_object)))
6797     {
6798       coding->dst_object = code_conversion_save (1, 1);
6799       coding->dst_pos = BEG;
6800       coding->dst_pos_byte = BEG_BYTE;
6801       coding->dst_multibyte = 1;
6802     }
6803   else if (BUFFERP (dst_object))
6804     {
6805       code_conversion_save (0, 0);
6806       coding->dst_object = dst_object;
6807       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6808       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6809       coding->dst_multibyte
6810         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6811     }
6812   else
6813     {
6814       code_conversion_save (0, 0);
6815       coding->dst_object = Qnil;
6816       coding->dst_multibyte = 1;
6817     }
6818
6819   decode_coding (coding);
6820
6821   if (BUFFERP (coding->dst_object))
6822     set_buffer_internal (XBUFFER (coding->dst_object));
6823
6824   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6825     {
6826       struct gcpro gcpro1, gcpro2;
6827       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6828       Lisp_Object val;
6829
6830       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6831       GCPRO2 (coding->src_object, coding->dst_object);
6832       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
6833                         make_number (coding->produced_char));
6834       UNGCPRO;
6835       CHECK_NATNUM (val);
6836       coding->produced_char += Z - prev_Z;
6837       coding->produced += Z_BYTE - prev_Z_BYTE;
6838     }
6839
6840   if (EQ (dst_object, Qt))
6841     {
6842       coding->dst_object = Fbuffer_string ();
6843     }
6844   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6845     {
6846       set_buffer_internal (XBUFFER (coding->dst_object));
6847       if (dst_bytes < coding->produced)
6848         {
6849           destination
6850             = (unsigned char *) xrealloc (destination, coding->produced);
6851           if (! destination)
6852             {
6853               record_conversion_result (coding,
6854                                         CODING_RESULT_INSUFFICIENT_DST);
6855               unbind_to (count, Qnil);
6856               return;
6857             }
6858           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6859             move_gap_both (BEGV, BEGV_BYTE);
6860           bcopy (BEGV_ADDR, destination, coding->produced);
6861           coding->destination = destination;
6862         }
6863     }
6864
6865   if (saved_pt >= 0)
6866     {
6867       /* This is the case of:
6868          (BUFFERP (src_object) && EQ (src_object, dst_object))
6869          As we have moved PT while replacing the original buffer
6870          contents, we must recover it now.  */
6871       set_buffer_internal (XBUFFER (src_object));
6872       if (saved_pt < from)
6873         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6874       else if (saved_pt < from + chars)
6875         TEMP_SET_PT_BOTH (from, from_byte);
6876       else if (! NILP (current_buffer->enable_multibyte_characters))
6877         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6878                           saved_pt_byte + (coding->produced - bytes));
6879       else
6880         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6881                           saved_pt_byte + (coding->produced - bytes));
6882     }
6883
6884   unbind_to (count, coding->dst_object);
6885 }
6886
6887
6888 void
6889 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6890                       dst_object)
6891      struct coding_system *coding;
6892      Lisp_Object src_object;
6893      EMACS_INT from, from_byte, to, to_byte;
6894      Lisp_Object dst_object;
6895 {
6896   int count = specpdl_ptr - specpdl;
6897   EMACS_INT chars = to - from;
6898   EMACS_INT bytes = to_byte - from_byte;
6899   Lisp_Object attrs;
6900   Lisp_Object buffer;
6901   int saved_pt = -1, saved_pt_byte;
6902   int kill_src_buffer = 0;
6903
6904   buffer = Fcurrent_buffer ();
6905
6906   coding->src_object = src_object;
6907   coding->src_chars = chars;
6908   coding->src_bytes = bytes;
6909   coding->src_multibyte = chars < bytes;
6910
6911   attrs = CODING_ID_ATTRS (coding->id);
6912
6913   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6914     {
6915       coding->src_object = code_conversion_save (1, coding->src_multibyte);
6916       set_buffer_internal (XBUFFER (coding->src_object));
6917       if (STRINGP (src_object))
6918         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6919       else if (BUFFERP (src_object))
6920         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6921       else
6922         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6923
6924       if (EQ (src_object, dst_object))
6925         {
6926           set_buffer_internal (XBUFFER (src_object));
6927           saved_pt = PT, saved_pt_byte = PT_BYTE;
6928           del_range_both (from, from_byte, to, to_byte, 1);
6929           set_buffer_internal (XBUFFER (coding->src_object));
6930         }
6931
6932       {
6933         Lisp_Object args[3];
6934
6935         args[0] = CODING_ATTR_PRE_WRITE (attrs);
6936         args[1] = make_number (BEG);
6937         args[2] = make_number (Z);
6938         safe_call (3, args);
6939       }
6940       if (XBUFFER (coding->src_object) != current_buffer)
6941         kill_src_buffer = 1;
6942       coding->src_object = Fcurrent_buffer ();
6943       if (BEG != GPT)
6944         move_gap_both (BEG, BEG_BYTE);
6945       coding->src_chars = Z - BEG;
6946       coding->src_bytes = Z_BYTE - BEG_BYTE;
6947       coding->src_pos = BEG;
6948       coding->src_pos_byte = BEG_BYTE;
6949       coding->src_multibyte = Z < Z_BYTE;
6950     }
6951   else if (STRINGP (src_object))
6952     {
6953       code_conversion_save (0, 0);
6954       coding->src_pos = from;
6955       coding->src_pos_byte = from_byte;
6956     }
6957   else if (BUFFERP (src_object))
6958     {
6959       code_conversion_save (0, 0);
6960       set_buffer_internal (XBUFFER (src_object));
6961       if (EQ (src_object, dst_object))
6962         {
6963           saved_pt = PT, saved_pt_byte = PT_BYTE;
6964           coding->src_object = del_range_1 (from, to, 1, 1);
6965           coding->src_pos = 0;
6966           coding->src_pos_byte = 0;
6967         }
6968       else
6969         {
6970           if (from < GPT && to >= GPT)
6971             move_gap_both (from, from_byte);
6972           coding->src_pos = from;
6973           coding->src_pos_byte = from_byte;
6974         }
6975     }
6976   else
6977     code_conversion_save (0, 0);
6978
6979   if (BUFFERP (dst_object))
6980     {
6981       coding->dst_object = dst_object;
6982       if (EQ (src_object, dst_object))
6983         {
6984           coding->dst_pos = from;
6985           coding->dst_pos_byte = from_byte;
6986         }
6987       else
6988         {
6989           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6990           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6991         }
6992       coding->dst_multibyte
6993         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6994     }
6995   else if (EQ (dst_object, Qt))
6996     {
6997       coding->dst_object = Qnil;
6998       coding->dst_bytes = coding->src_chars;
6999       if (coding->dst_bytes == 0)
7000         coding->dst_bytes = 1;
7001       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7002       coding->dst_multibyte = 0;
7003     }
7004   else
7005     {
7006       coding->dst_object = Qnil;
7007       coding->dst_multibyte = 0;
7008     }
7009
7010   encode_coding (coding);
7011
7012   if (EQ (dst_object, Qt))
7013     {
7014       if (BUFFERP (coding->dst_object))
7015         coding->dst_object = Fbuffer_string ();
7016       else
7017         {
7018           coding->dst_object
7019             = make_unibyte_string ((char *) coding->destination,
7020                                    coding->produced);
7021           xfree (coding->destination);
7022         }
7023     }
7024
7025   if (saved_pt >= 0)
7026     {
7027       /* This is the case of:
7028          (BUFFERP (src_object) && EQ (src_object, dst_object))
7029          As we have moved PT while replacing the original buffer
7030          contents, we must recover it now.  */
7031       set_buffer_internal (XBUFFER (src_object));
7032       if (saved_pt < from)
7033         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7034       else if (saved_pt < from + chars)
7035         TEMP_SET_PT_BOTH (from, from_byte);
7036       else if (! NILP (current_buffer->enable_multibyte_characters))
7037         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7038                           saved_pt_byte + (coding->produced - bytes));
7039       else
7040         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7041                           saved_pt_byte + (coding->produced - bytes));
7042     }
7043
7044   if (kill_src_buffer)
7045     Fkill_buffer (coding->src_object);
7046   unbind_to (count, Qnil);
7047 }
7048
7049
7050 Lisp_Object
7051 preferred_coding_system ()
7052 {
7053   int id = coding_categories[coding_priorities[0]].id;
7054
7055   return CODING_ID_NAME (id);
7056 }
7057
7058 \f
7059 #ifdef emacs
7060 /*** 11. Emacs Lisp library functions ***/
7061
7062 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7063        doc: /* Return t if OBJECT is nil or a coding-system.
7064 See the documentation of `define-coding-system' for information
7065 about coding-system objects.  */)
7066      (obj)
7067      Lisp_Object obj;
7068 {
7069   if (NILP (obj)
7070       || CODING_SYSTEM_ID (obj) >= 0)
7071     return Qt;
7072   if (! SYMBOLP (obj)
7073       || NILP (Fget (obj, Qcoding_system_define_form)))
7074     return Qnil;
7075   return Qt;
7076 }
7077
7078 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7079        Sread_non_nil_coding_system, 1, 1, 0,
7080        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7081      (prompt)
7082      Lisp_Object prompt;
7083 {
7084   Lisp_Object val;
7085   do
7086     {
7087       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7088                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7089     }
7090   while (SCHARS (val) == 0);
7091   return (Fintern (val, Qnil));
7092 }
7093
7094 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7095        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7096 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
7097      (prompt, default_coding_system)
7098      Lisp_Object prompt, default_coding_system;
7099 {
7100   Lisp_Object val;
7101   if (SYMBOLP (default_coding_system))
7102     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
7103   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7104                           Qt, Qnil, Qcoding_system_history,
7105                           default_coding_system, Qnil);
7106   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7107 }
7108
7109 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7110        1, 1, 0,
7111        doc: /* Check validity of CODING-SYSTEM.
7112 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7113 It is valid if it is nil or a symbol defined as a coding system by the
7114 function `define-coding-system'.  */)
7115   (coding_system)
7116      Lisp_Object coding_system;
7117 {
7118   Lisp_Object define_form;
7119
7120   define_form = Fget (coding_system, Qcoding_system_define_form);
7121   if (! NILP (define_form))
7122     {
7123       Fput (coding_system, Qcoding_system_define_form, Qnil);
7124       safe_eval (define_form);
7125     }
7126   if (!NILP (Fcoding_system_p (coding_system)))
7127     return coding_system;
7128   while (1)
7129     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7130 }
7131
7132 \f
7133 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
7134    HIGHEST is nonzero, return the coding system of the highest
7135    priority among the detected coding systems.  Otherwise return a
7136    list of detected coding systems sorted by their priorities.  If
7137    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7138    multibyte form but contains only ASCII and eight-bit chars.
7139    Otherwise, the bytes are raw bytes.
7140
7141    CODING-SYSTEM controls the detection as below:
7142
7143    If it is nil, detect both text-format and eol-format.  If the
7144    text-format part of CODING-SYSTEM is already specified
7145    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
7146    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7147    detect only text-format.  */
7148
7149 Lisp_Object
7150 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7151                       coding_system)
7152      const unsigned char *src;
7153      int src_chars, src_bytes, highest;
7154      int multibytep;
7155      Lisp_Object coding_system;
7156 {
7157   const unsigned char *src_end = src + src_bytes;
7158   Lisp_Object attrs, eol_type;
7159   Lisp_Object val;
7160   struct coding_system coding;
7161   int id;
7162   struct coding_detection_info detect_info;
7163   enum coding_category base_category;
7164
7165   if (NILP (coding_system))
7166     coding_system = Qundecided;
7167   setup_coding_system (coding_system, &coding);
7168   attrs = CODING_ID_ATTRS (coding.id);
7169   eol_type = CODING_ID_EOL_TYPE (coding.id);
7170   coding_system = CODING_ATTR_BASE_NAME (attrs);
7171
7172   coding.source = src;
7173   coding.src_chars = src_chars;
7174   coding.src_bytes = src_bytes;
7175   coding.src_multibyte = multibytep;
7176   coding.consumed = 0;
7177   coding.mode |= CODING_MODE_LAST_BLOCK;
7178
7179   detect_info.checked = detect_info.found = detect_info.rejected = 0;
7180
7181   /* At first, detect text-format if necessary.  */
7182   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7183   if (base_category == coding_category_undecided)
7184     {
7185       enum coding_category category;
7186       struct coding_system *this;
7187       int c, i;
7188
7189       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
7190       for (i = 0; src < src_end; i++, src++)
7191         {
7192           c = *src;
7193           if (c & 0x80)
7194             break;
7195           if (c < 0x20
7196               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7197               && inhibit_iso_escape_detection)
7198             {
7199               coding.head_ascii = src - coding.source;
7200               if (detect_coding_iso_2022 (&coding, &detect_info))
7201                 {
7202                   /* We have scanned the whole data.  */
7203                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7204                     /* We didn't find an 8-bit code.  */
7205                     src = src_end;
7206                   break;
7207                 }
7208             }
7209         }
7210       coding.head_ascii = src - coding.source;
7211
7212       if (src < src_end
7213           || detect_info.found)
7214         {
7215           if (src == src_end)
7216             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
7217             for (i = 0; i < coding_category_raw_text; i++)
7218               {
7219                 category = coding_priorities[i];
7220                 if (detect_info.found & (1 << category))
7221                   break;
7222               }
7223           else
7224             for (i = 0; i < coding_category_raw_text; i++)
7225               {
7226                 category = coding_priorities[i];
7227                 this = coding_categories + category;
7228
7229                 if (this->id < 0)
7230                   {
7231                     /* No coding system of this category is defined.  */
7232                     detect_info.rejected |= (1 << category);
7233                   }
7234                 else if (category >= coding_category_raw_text)
7235                   continue;
7236                 else if (detect_info.checked & (1 << category))
7237                   {
7238                     if (highest
7239                         && (detect_info.found & (1 << category)))
7240                       break;
7241                   }
7242                 else
7243                   {
7244                     if ((*(this->detector)) (&coding, &detect_info)
7245                         && highest
7246                         && (detect_info.found & (1 << category)))
7247                       {
7248                         if (category == coding_category_utf_16_auto)
7249                           {
7250                             if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7251                               category = coding_category_utf_16_le;
7252                             else
7253                               category = coding_category_utf_16_be;
7254                           }
7255                         break;
7256                       }
7257                   }
7258               }
7259         }
7260
7261       if (detect_info.rejected == CATEGORY_MASK_ANY)
7262         {
7263           detect_info.found = CATEGORY_MASK_RAW_TEXT;
7264           id = coding_categories[coding_category_raw_text].id;
7265           val = Fcons (make_number (id), Qnil);
7266         }
7267       else if (! detect_info.rejected && ! detect_info.found)
7268         {
7269           detect_info.found = CATEGORY_MASK_ANY;
7270           id = coding_categories[coding_category_undecided].id;
7271           val = Fcons (make_number (id), Qnil);
7272         }
7273       else if (highest)
7274         {
7275           if (detect_info.found)
7276             {
7277               detect_info.found = 1 << category;
7278               val = Fcons (make_number (this->id), Qnil);
7279             }
7280           else
7281             for (i = 0; i < coding_category_raw_text; i++)
7282               if (! (detect_info.rejected & (1 << coding_priorities[i])))
7283                 {
7284                   detect_info.found = 1 << coding_priorities[i];
7285                   id = coding_categories[coding_priorities[i]].id;
7286                   val = Fcons (make_number (id), Qnil);
7287                   break;
7288                 }
7289         }
7290       else
7291         {
7292           int mask = detect_info.rejected | detect_info.found;
7293           int found = 0;
7294           val = Qnil;
7295
7296           for (i = coding_category_raw_text - 1; i >= 0; i--)
7297             {
7298               category = coding_priorities[i];
7299               if (! (mask & (1 << category)))
7300                 {
7301                   found |= 1 << category;
7302                   id = coding_categories[category].id;
7303                   val = Fcons (make_number (id), val);
7304                 }
7305             }
7306           for (i = coding_category_raw_text - 1; i >= 0; i--)
7307             {
7308               category = coding_priorities[i];
7309               if (detect_info.found & (1 << category))
7310                 {
7311                   id = coding_categories[category].id;
7312                   val = Fcons (make_number (id), val);
7313                 }
7314             }
7315           detect_info.found |= found;
7316         }
7317     }
7318   else if (base_category == coding_category_utf_16_auto)
7319     {
7320       if (detect_coding_utf_16 (&coding, &detect_info))
7321         {
7322           struct coding_system *this;
7323
7324           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7325             this = coding_categories + coding_category_utf_16_le;
7326           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7327             this = coding_categories + coding_category_utf_16_be;
7328           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7329             this = coding_categories + coding_category_utf_16_be_nosig;
7330           else
7331             this = coding_categories + coding_category_utf_16_le_nosig;
7332           val = Fcons (make_number (this->id), Qnil);
7333         }
7334     }
7335   else
7336     {
7337       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7338       val = Fcons (make_number (coding.id), Qnil);
7339     }
7340
7341   /* Then, detect eol-format if necessary.  */
7342   {
7343     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7344     Lisp_Object tail;
7345
7346     if (VECTORP (eol_type))
7347       {
7348         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7349           normal_eol = detect_eol (coding.source, src_bytes,
7350                                    coding_category_raw_text);
7351         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7352                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7353           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7354                                       coding_category_utf_16_be);
7355         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7356                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7357           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7358                                       coding_category_utf_16_le);
7359       }
7360     else
7361       {
7362         if (EQ (eol_type, Qunix))
7363           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7364         else if (EQ (eol_type, Qdos))
7365           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7366         else
7367           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7368       }
7369
7370     for (tail = val; CONSP (tail); tail = XCDR (tail))
7371       {
7372         enum coding_category category;
7373         int this_eol;
7374
7375         id = XINT (XCAR (tail));
7376         attrs = CODING_ID_ATTRS (id);
7377         category = XINT (CODING_ATTR_CATEGORY (attrs));
7378         eol_type = CODING_ID_EOL_TYPE (id);
7379         if (VECTORP (eol_type))
7380           {
7381             if (category == coding_category_utf_16_be
7382                 || category == coding_category_utf_16_be_nosig)
7383               this_eol = utf_16_be_eol;
7384             else if (category == coding_category_utf_16_le
7385                      || category == coding_category_utf_16_le_nosig)
7386               this_eol = utf_16_le_eol;
7387             else
7388               this_eol = normal_eol;
7389
7390             if (this_eol == EOL_SEEN_LF)
7391               XSETCAR (tail, AREF (eol_type, 0));
7392             else if (this_eol == EOL_SEEN_CRLF)
7393               XSETCAR (tail, AREF (eol_type, 1));
7394             else if (this_eol == EOL_SEEN_CR)
7395               XSETCAR (tail, AREF (eol_type, 2));
7396             else
7397               XSETCAR (tail, CODING_ID_NAME (id));
7398           }
7399         else
7400           XSETCAR (tail, CODING_ID_NAME (id));
7401       }
7402   }
7403
7404   return (highest ? XCAR (val) : val);
7405 }
7406
7407
7408 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7409        2, 3, 0,
7410        doc: /* Detect coding system of the text in the region between START and END.
7411 Return a list of possible coding systems ordered by priority.
7412
7413 If only ASCII characters are found, it returns a list of single element
7414 `undecided' or its subsidiary coding system according to a detected
7415 end-of-line format.
7416
7417 If optional argument HIGHEST is non-nil, return the coding system of
7418 highest priority.  */)
7419      (start, end, highest)
7420      Lisp_Object start, end, highest;
7421 {
7422   int from, to;
7423   int from_byte, to_byte;
7424
7425   CHECK_NUMBER_COERCE_MARKER (start);
7426   CHECK_NUMBER_COERCE_MARKER (end);
7427
7428   validate_region (&start, &end);
7429   from = XINT (start), to = XINT (end);
7430   from_byte = CHAR_TO_BYTE (from);
7431   to_byte = CHAR_TO_BYTE (to);
7432
7433   if (from < GPT && to >= GPT)
7434     move_gap_both (to, to_byte);
7435
7436   return detect_coding_system (BYTE_POS_ADDR (from_byte),
7437                                to - from, to_byte - from_byte,
7438                                !NILP (highest),
7439                                !NILP (current_buffer
7440                                       ->enable_multibyte_characters),
7441                                Qnil);
7442 }
7443
7444 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7445        1, 2, 0,
7446        doc: /* Detect coding system of the text in STRING.
7447 Return a list of possible coding systems ordered by priority.
7448
7449 If only ASCII characters are found, it returns a list of single element
7450 `undecided' or its subsidiary coding system according to a detected
7451 end-of-line format.
7452
7453 If optional argument HIGHEST is non-nil, return the coding system of
7454 highest priority.  */)
7455      (string, highest)
7456      Lisp_Object string, highest;
7457 {
7458   CHECK_STRING (string);
7459
7460   return detect_coding_system (SDATA (string),
7461                                SCHARS (string), SBYTES (string),
7462                                !NILP (highest), STRING_MULTIBYTE (string),
7463                                Qnil);
7464 }
7465
7466
7467 static INLINE int
7468 char_encodable_p (c, attrs)
7469      int c;
7470      Lisp_Object attrs;
7471 {
7472   Lisp_Object tail;
7473   struct charset *charset;
7474   Lisp_Object translation_table;
7475
7476   translation_table = CODING_ATTR_TRANS_TBL (attrs);
7477   if (! NILP (translation_table))
7478     c = translate_char (translation_table, c);
7479   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7480        CONSP (tail); tail = XCDR (tail))
7481     {
7482       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7483       if (CHAR_CHARSET_P (c, charset))
7484         break;
7485     }
7486   return (! NILP (tail));
7487 }
7488
7489
7490 /* Return a list of coding systems that safely encode the text between
7491    START and END.  If EXCLUDE is non-nil, it is a list of coding
7492    systems not to check.  The returned list doesn't contain any such
7493    coding systems.  In any case, if the text contains only ASCII or is
7494    unibyte, return t.  */
7495
7496 DEFUN ("find-coding-systems-region-internal",
7497        Ffind_coding_systems_region_internal,
7498        Sfind_coding_systems_region_internal, 2, 3, 0,
7499        doc: /* Internal use only.  */)
7500      (start, end, exclude)
7501      Lisp_Object start, end, exclude;
7502 {
7503   Lisp_Object coding_attrs_list, safe_codings;
7504   EMACS_INT start_byte, end_byte;
7505   const unsigned char *p, *pbeg, *pend;
7506   int c;
7507   Lisp_Object tail, elt;
7508
7509   if (STRINGP (start))
7510     {
7511       if (!STRING_MULTIBYTE (start)
7512           || SCHARS (start) == SBYTES (start))
7513         return Qt;
7514       start_byte = 0;
7515       end_byte = SBYTES (start);
7516     }
7517   else
7518     {
7519       CHECK_NUMBER_COERCE_MARKER (start);
7520       CHECK_NUMBER_COERCE_MARKER (end);
7521       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7522         args_out_of_range (start, end);
7523       if (NILP (current_buffer->enable_multibyte_characters))
7524         return Qt;
7525       start_byte = CHAR_TO_BYTE (XINT (start));
7526       end_byte = CHAR_TO_BYTE (XINT (end));
7527       if (XINT (end) - XINT (start) == end_byte - start_byte)
7528         return Qt;
7529
7530       if (XINT (start) < GPT && XINT (end) > GPT)
7531         {
7532           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7533             move_gap_both (XINT (start), start_byte);
7534           else
7535             move_gap_both (XINT (end), end_byte);
7536         }
7537     }
7538
7539   coding_attrs_list = Qnil;
7540   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7541     if (NILP (exclude)
7542         || NILP (Fmemq (XCAR (tail), exclude)))
7543       {
7544         Lisp_Object attrs;
7545
7546         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7547         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7548             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7549           {
7550             ASET (attrs, coding_attr_trans_tbl,
7551                   get_translation_table (attrs, 1, NULL));
7552             coding_attrs_list = Fcons (attrs, coding_attrs_list);
7553           }
7554       }
7555
7556   if (STRINGP (start))
7557     p = pbeg = SDATA (start);
7558   else
7559     p = pbeg = BYTE_POS_ADDR (start_byte);
7560   pend = p + (end_byte - start_byte);
7561
7562   while (p < pend && ASCII_BYTE_P (*p)) p++;
7563   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7564
7565   while (p < pend)
7566     {
7567       if (ASCII_BYTE_P (*p))
7568         p++;
7569       else
7570         {
7571           c = STRING_CHAR_ADVANCE (p);
7572
7573           charset_map_loaded = 0;
7574           for (tail = coding_attrs_list; CONSP (tail);)
7575             {
7576               elt = XCAR (tail);
7577               if (NILP (elt))
7578                 tail = XCDR (tail);
7579               else if (char_encodable_p (c, elt))
7580                 tail = XCDR (tail);
7581               else if (CONSP (XCDR (tail)))
7582                 {
7583                   XSETCAR (tail, XCAR (XCDR (tail)));
7584                   XSETCDR (tail, XCDR (XCDR (tail)));
7585                 }
7586               else
7587                 {
7588                   XSETCAR (tail, Qnil);
7589                   tail = XCDR (tail);
7590                 }
7591             }
7592           if (charset_map_loaded)
7593             {
7594               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7595
7596               if (STRINGP (start))
7597                 pbeg = SDATA (start);
7598               else
7599                 pbeg = BYTE_POS_ADDR (start_byte);
7600               p = pbeg + p_offset;
7601               pend = pbeg + pend_offset;
7602             }
7603         }
7604     }
7605
7606   safe_codings = list2 (Qraw_text, Qno_conversion);
7607   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7608     if (! NILP (XCAR (tail)))
7609       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7610
7611   return safe_codings;
7612 }
7613
7614
7615 DEFUN ("unencodable-char-position", Funencodable_char_position,
7616        Sunencodable_char_position, 3, 5, 0,
7617        doc: /*
7618 Return position of first un-encodable character in a region.
7619 START and END specfiy the region and CODING-SYSTEM specifies the
7620 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7621
7622 If optional 4th argument COUNT is non-nil, it specifies at most how
7623 many un-encodable characters to search.  In this case, the value is a
7624 list of positions.
7625
7626 If optional 5th argument STRING is non-nil, it is a string to search
7627 for un-encodable characters.  In that case, START and END are indexes
7628 to the string.  */)
7629      (start, end, coding_system, count, string)
7630      Lisp_Object start, end, coding_system, count, string;
7631 {
7632   int n;
7633   struct coding_system coding;
7634   Lisp_Object attrs, charset_list, translation_table;
7635   Lisp_Object positions;
7636   int from, to;
7637   const unsigned char *p, *stop, *pend;
7638   int ascii_compatible;
7639
7640   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7641   attrs = CODING_ID_ATTRS (coding.id);
7642   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7643     return Qnil;
7644   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7645   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7646   translation_table = get_translation_table (attrs, 1, NULL);
7647
7648   if (NILP (string))
7649     {
7650       validate_region (&start, &end);
7651       from = XINT (start);
7652       to = XINT (end);
7653       if (NILP (current_buffer->enable_multibyte_characters)
7654           || (ascii_compatible
7655               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7656         return Qnil;
7657       p = CHAR_POS_ADDR (from);
7658       pend = CHAR_POS_ADDR (to);
7659       if (from < GPT && to >= GPT)
7660         stop = GPT_ADDR;
7661       else
7662         stop = pend;
7663     }
7664   else
7665     {
7666       CHECK_STRING (string);
7667       CHECK_NATNUM (start);
7668       CHECK_NATNUM (end);
7669       from = XINT (start);
7670       to = XINT (end);
7671       if (from > to
7672           || to > SCHARS (string))
7673         args_out_of_range_3 (string, start, end);
7674       if (! STRING_MULTIBYTE (string))
7675         return Qnil;
7676       p = SDATA (string) + string_char_to_byte (string, from);
7677       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7678       if (ascii_compatible && (to - from) == (pend - p))
7679         return Qnil;
7680     }
7681
7682   if (NILP (count))
7683     n = 1;
7684   else
7685     {
7686       CHECK_NATNUM (count);
7687       n = XINT (count);
7688     }
7689
7690   positions = Qnil;
7691   while (1)
7692     {
7693       int c;
7694
7695       if (ascii_compatible)
7696         while (p < stop && ASCII_BYTE_P (*p))
7697           p++, from++;
7698       if (p >= stop)
7699         {
7700           if (p >= pend)
7701             break;
7702           stop = pend;
7703           p = GAP_END_ADDR;
7704         }
7705
7706       c = STRING_CHAR_ADVANCE (p);
7707       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7708           && ! char_charset (translate_char (translation_table, c),
7709                              charset_list, NULL))
7710         {
7711           positions = Fcons (make_number (from), positions);
7712           n--;
7713           if (n == 0)
7714             break;
7715         }
7716
7717       from++;
7718     }
7719
7720   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7721 }
7722
7723
7724 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7725        Scheck_coding_systems_region, 3, 3, 0,
7726        doc: /* Check if the region is encodable by coding systems.
7727
7728 START and END are buffer positions specifying the region.
7729 CODING-SYSTEM-LIST is a list of coding systems to check.
7730
7731 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7732 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7733 whole region, POS0, POS1, ... are buffer positions where non-encodable
7734 characters are found.
7735
7736 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7737 value is nil.
7738
7739 START may be a string.  In that case, check if the string is
7740 encodable, and the value contains indices to the string instead of
7741 buffer positions.  END is ignored.  */)
7742      (start, end, coding_system_list)
7743      Lisp_Object start, end, coding_system_list;
7744 {
7745   Lisp_Object list;
7746   EMACS_INT start_byte, end_byte;
7747   int pos;
7748   const unsigned char *p, *pbeg, *pend;
7749   int c;
7750   Lisp_Object tail, elt, attrs;
7751
7752   if (STRINGP (start))
7753     {
7754       if (!STRING_MULTIBYTE (start)
7755           && SCHARS (start) != SBYTES (start))
7756         return Qnil;
7757       start_byte = 0;
7758       end_byte = SBYTES (start);
7759       pos = 0;
7760     }
7761   else
7762     {
7763       CHECK_NUMBER_COERCE_MARKER (start);
7764       CHECK_NUMBER_COERCE_MARKER (end);
7765       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7766         args_out_of_range (start, end);
7767       if (NILP (current_buffer->enable_multibyte_characters))
7768         return Qnil;
7769       start_byte = CHAR_TO_BYTE (XINT (start));
7770       end_byte = CHAR_TO_BYTE (XINT (end));
7771       if (XINT (end) - XINT (start) == end_byte - start_byte)
7772         return Qt;
7773
7774       if (XINT (start) < GPT && XINT (end) > GPT)
7775         {
7776           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7777             move_gap_both (XINT (start), start_byte);
7778           else
7779             move_gap_both (XINT (end), end_byte);
7780         }
7781       pos = XINT (start);
7782     }
7783
7784   list = Qnil;
7785   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7786     {
7787       elt = XCAR (tail);
7788       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7789       ASET (attrs, coding_attr_trans_tbl,
7790             get_translation_table (attrs, 1, NULL));
7791       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7792     }
7793
7794   if (STRINGP (start))
7795     p = pbeg = SDATA (start);
7796   else
7797     p = pbeg = BYTE_POS_ADDR (start_byte);
7798   pend = p + (end_byte - start_byte);
7799
7800   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7801   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7802
7803   while (p < pend)
7804     {
7805       if (ASCII_BYTE_P (*p))
7806         p++;
7807       else
7808         {
7809           c = STRING_CHAR_ADVANCE (p);
7810
7811           charset_map_loaded = 0;
7812           for (tail = list; CONSP (tail); tail = XCDR (tail))
7813             {
7814               elt = XCDR (XCAR (tail));
7815               if (! char_encodable_p (c, XCAR (elt)))
7816                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7817             }
7818           if (charset_map_loaded)
7819             {
7820               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7821
7822               if (STRINGP (start))
7823                 pbeg = SDATA (start);
7824               else
7825                 pbeg = BYTE_POS_ADDR (start_byte);
7826               p = pbeg + p_offset;
7827               pend = pbeg + pend_offset;
7828             }
7829         }
7830       pos++;
7831     }
7832
7833   tail = list;
7834   list = Qnil;
7835   for (; CONSP (tail); tail = XCDR (tail))
7836     {
7837       elt = XCAR (tail);
7838       if (CONSP (XCDR (XCDR (elt))))
7839         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7840                       list);
7841     }
7842
7843   return list;
7844 }
7845
7846
7847 Lisp_Object
7848 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7849      Lisp_Object start, end, coding_system, dst_object;
7850      int encodep, norecord;
7851 {
7852   struct coding_system coding;
7853   EMACS_INT from, from_byte, to, to_byte;
7854   Lisp_Object src_object;
7855
7856   CHECK_NUMBER_COERCE_MARKER (start);
7857   CHECK_NUMBER_COERCE_MARKER (end);
7858   if (NILP (coding_system))
7859     coding_system = Qno_conversion;
7860   else
7861     CHECK_CODING_SYSTEM (coding_system);
7862   src_object = Fcurrent_buffer ();
7863   if (NILP (dst_object))
7864     dst_object = src_object;
7865   else if (! EQ (dst_object, Qt))
7866     CHECK_BUFFER (dst_object);
7867
7868   validate_region (&start, &end);
7869   from = XFASTINT (start);
7870   from_byte = CHAR_TO_BYTE (from);
7871   to = XFASTINT (end);
7872   to_byte = CHAR_TO_BYTE (to);
7873
7874   setup_coding_system (coding_system, &coding);
7875   coding.mode |= CODING_MODE_LAST_BLOCK;
7876
7877   if (encodep)
7878     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7879                           dst_object);
7880   else
7881     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7882                           dst_object);
7883   if (! norecord)
7884     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7885
7886   return (BUFFERP (dst_object)
7887           ? make_number (coding.produced_char)
7888           : coding.dst_object);
7889 }
7890
7891
7892 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7893        3, 4, "r\nzCoding system: ",
7894        doc: /* Decode the current region from the specified coding system.
7895 When called from a program, takes four arguments:
7896         START, END, CODING-SYSTEM, and DESTINATION.
7897 START and END are buffer positions.
7898
7899 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7900 If nil, the region between START and END is replace by the decoded text.
7901 If buffer, the decoded text is inserted in the buffer.
7902 If t, the decoded text is returned.
7903
7904 This function sets `last-coding-system-used' to the precise coding system
7905 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7906 not fully specified.)
7907 It returns the length of the decoded text.  */)
7908      (start, end, coding_system, destination)
7909      Lisp_Object start, end, coding_system, destination;
7910 {
7911   return code_convert_region (start, end, coding_system, destination, 0, 0);
7912 }
7913
7914 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7915        3, 4, "r\nzCoding system: ",
7916        doc: /* Encode the current region by specified coding system.
7917 When called from a program, takes three arguments:
7918 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7919
7920 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7921 If nil, the region between START and END is replace by the encoded text.
7922 If buffer, the encoded text is inserted in the buffer.
7923 If t, the encoded text is returned.
7924
7925 This function sets `last-coding-system-used' to the precise coding system
7926 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7927 not fully specified.)
7928 It returns the length of the encoded text.  */)
7929   (start, end, coding_system, destination)
7930      Lisp_Object start, end, coding_system, destination;
7931 {
7932   return code_convert_region (start, end, coding_system, destination, 1, 0);
7933 }
7934
7935 Lisp_Object
7936 code_convert_string (string, coding_system, dst_object,
7937                      encodep, nocopy, norecord)
7938      Lisp_Object string, coding_system, dst_object;
7939      int encodep, nocopy, norecord;
7940 {
7941   struct coding_system coding;
7942   EMACS_INT chars, bytes;
7943
7944   CHECK_STRING (string);
7945   if (NILP (coding_system))
7946     {
7947       if (! norecord)
7948         Vlast_coding_system_used = Qno_conversion;
7949       if (NILP (dst_object))
7950         return (nocopy ? Fcopy_sequence (string) : string);
7951     }
7952
7953   if (NILP (coding_system))
7954     coding_system = Qno_conversion;
7955   else
7956     CHECK_CODING_SYSTEM (coding_system);
7957   if (NILP (dst_object))
7958     dst_object = Qt;
7959   else if (! EQ (dst_object, Qt))
7960     CHECK_BUFFER (dst_object);
7961
7962   setup_coding_system (coding_system, &coding);
7963   coding.mode |= CODING_MODE_LAST_BLOCK;
7964   chars = SCHARS (string);
7965   bytes = SBYTES (string);
7966   if (encodep)
7967     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7968   else
7969     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7970   if (! norecord)
7971     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7972
7973   return (BUFFERP (dst_object)
7974           ? make_number (coding.produced_char)
7975           : coding.dst_object);
7976 }
7977
7978
7979 /* Encode or decode STRING according to CODING_SYSTEM.
7980    Do not set Vlast_coding_system_used.
7981
7982    This function is called only from macros DECODE_FILE and
7983    ENCODE_FILE, thus we ignore character composition.  */
7984
7985 Lisp_Object
7986 code_convert_string_norecord (string, coding_system, encodep)
7987      Lisp_Object string, coding_system;
7988      int encodep;
7989 {
7990   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7991 }
7992
7993
7994 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7995        2, 4, 0,
7996        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7997
7998 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7999 if the decoding operation is trivial.
8000
8001 Optional fourth arg BUFFER non-nil meant that the decoded text is
8002 inserted in BUFFER instead of returned as a string.  In this case,
8003 the return value is BUFFER.
8004
8005 This function sets `last-coding-system-used' to the precise coding system
8006 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8007 not fully specified.  */)
8008   (string, coding_system, nocopy, buffer)
8009      Lisp_Object string, coding_system, nocopy, buffer;
8010 {
8011   return code_convert_string (string, coding_system, buffer,
8012                               0, ! NILP (nocopy), 0);
8013 }
8014
8015 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8016        2, 4, 0,
8017        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8018
8019 Optional third arg NOCOPY non-nil means it is OK to return STRING
8020 itself if the encoding operation is trivial.
8021
8022 Optional fourth arg BUFFER non-nil meant that the encoded text is
8023 inserted in BUFFER instead of returned as a string.  In this case,
8024 the return value is BUFFER.
8025
8026 This function sets `last-coding-system-used' to the precise coding system
8027 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8028 not fully specified.)  */)
8029      (string, coding_system, nocopy, buffer)
8030      Lisp_Object string, coding_system, nocopy, buffer;
8031 {
8032   return code_convert_string (string, coding_system, buffer,
8033                               1, ! NILP (nocopy), 1);
8034 }
8035
8036 \f
8037 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8038        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8039 Return the corresponding character.  */)
8040      (code)
8041      Lisp_Object code;
8042 {
8043   Lisp_Object spec, attrs, val;
8044   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8045   int c;
8046
8047   CHECK_NATNUM (code);
8048   c = XFASTINT (code);
8049   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8050   attrs = AREF (spec, 0);
8051
8052   if (ASCII_BYTE_P (c)
8053       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8054     return code;
8055
8056   val = CODING_ATTR_CHARSET_LIST (attrs);
8057   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8058   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8059   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8060
8061   if (c <= 0x7F)
8062     charset = charset_roman;
8063   else if (c >= 0xA0 && c < 0xDF)
8064     {
8065       charset = charset_kana;
8066       c -= 0x80;
8067     }
8068   else
8069     {
8070       int s1 = c >> 8, s2 = c & 0xFF;
8071
8072       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8073           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8074         error ("Invalid code: %d", code);
8075       SJIS_TO_JIS (c);
8076       charset = charset_kanji;
8077     }
8078   c = DECODE_CHAR (charset, c);
8079   if (c < 0)
8080     error ("Invalid code: %d", code);
8081   return make_number (c);
8082 }
8083
8084
8085 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8086        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
8087 Return the corresponding code in SJIS.  */)
8088      (ch)
8089     Lisp_Object ch;
8090 {
8091   Lisp_Object spec, attrs, charset_list;
8092   int c;
8093   struct charset *charset;
8094   unsigned code;
8095
8096   CHECK_CHARACTER (ch);
8097   c = XFASTINT (ch);
8098   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8099   attrs = AREF (spec, 0);
8100
8101   if (ASCII_CHAR_P (c)
8102       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8103     return ch;
8104
8105   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8106   charset = char_charset (c, charset_list, &code);
8107   if (code == CHARSET_INVALID_CODE (charset))
8108     error ("Can't encode by shift_jis encoding: %d", c);
8109   JIS_TO_SJIS (code);
8110
8111   return make_number (code);
8112 }
8113
8114 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8115        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8116 Return the corresponding character.  */)
8117      (code)
8118      Lisp_Object code;
8119 {
8120   Lisp_Object spec, attrs, val;
8121   struct charset *charset_roman, *charset_big5, *charset;
8122   int c;
8123
8124   CHECK_NATNUM (code);
8125   c = XFASTINT (code);
8126   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8127   attrs = AREF (spec, 0);
8128
8129   if (ASCII_BYTE_P (c)
8130       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8131     return code;
8132
8133   val = CODING_ATTR_CHARSET_LIST (attrs);
8134   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8135   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8136
8137   if (c <= 0x7F)
8138     charset = charset_roman;
8139   else
8140     {
8141       int b1 = c >> 8, b2 = c & 0x7F;
8142       if (b1 < 0xA1 || b1 > 0xFE
8143           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8144         error ("Invalid code: %d", code);
8145       charset = charset_big5;
8146     }
8147   c = DECODE_CHAR (charset, (unsigned )c);
8148   if (c < 0)
8149     error ("Invalid code: %d", code);
8150   return make_number (c);
8151 }
8152
8153 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8154        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
8155 Return the corresponding character code in Big5.  */)
8156      (ch)
8157      Lisp_Object ch;
8158 {
8159   Lisp_Object spec, attrs, charset_list;
8160   struct charset *charset;
8161   int c;
8162   unsigned code;
8163
8164   CHECK_CHARACTER (ch);
8165   c = XFASTINT (ch);
8166   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8167   attrs = AREF (spec, 0);
8168   if (ASCII_CHAR_P (c)
8169       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8170     return ch;
8171
8172   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8173   charset = char_charset (c, charset_list, &code);
8174   if (code == CHARSET_INVALID_CODE (charset))
8175     error ("Can't encode by Big5 encoding: %d", c);
8176
8177   return make_number (code);
8178 }
8179
8180 \f
8181 DEFUN ("set-terminal-coding-system-internal",
8182        Fset_terminal_coding_system_internal,
8183        Sset_terminal_coding_system_internal, 1, 1, 0,
8184        doc: /* Internal use only.  */)
8185      (coding_system)
8186      Lisp_Object coding_system;
8187 {
8188   CHECK_SYMBOL (coding_system);
8189   setup_coding_system (Fcheck_coding_system (coding_system),
8190                         &terminal_coding);
8191
8192   /* We had better not send unsafe characters to terminal.  */
8193   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
8194   /* Characer composition should be disabled.  */
8195   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8196   terminal_coding.src_multibyte = 1;
8197   terminal_coding.dst_multibyte = 0;
8198   return Qnil;
8199 }
8200
8201 DEFUN ("set-safe-terminal-coding-system-internal",
8202        Fset_safe_terminal_coding_system_internal,
8203        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8204        doc: /* Internal use only.  */)
8205      (coding_system)
8206      Lisp_Object coding_system;
8207 {
8208   CHECK_SYMBOL (coding_system);
8209   setup_coding_system (Fcheck_coding_system (coding_system),
8210                        &safe_terminal_coding);
8211   /* Characer composition should be disabled.  */
8212   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8213   safe_terminal_coding.src_multibyte = 1;
8214   safe_terminal_coding.dst_multibyte = 0;
8215   return Qnil;
8216 }
8217
8218 DEFUN ("terminal-coding-system",
8219        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
8220        doc: /* Return coding system specified for terminal output.  */)
8221      ()
8222 {
8223   Lisp_Object coding_system;
8224
8225   coding_system = CODING_ID_NAME (terminal_coding.id);
8226   /* For backward compatibility, return nil if it is `undecided'. */
8227   return (coding_system != Qundecided ? coding_system : Qnil);
8228 }
8229
8230 DEFUN ("set-keyboard-coding-system-internal",
8231        Fset_keyboard_coding_system_internal,
8232        Sset_keyboard_coding_system_internal, 1, 1, 0,
8233        doc: /* Internal use only.  */)
8234      (coding_system)
8235      Lisp_Object coding_system;
8236 {
8237   CHECK_SYMBOL (coding_system);
8238   setup_coding_system (Fcheck_coding_system (coding_system),
8239                        &keyboard_coding);
8240   /* Characer composition should be disabled.  */
8241   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8242   return Qnil;
8243 }
8244
8245 DEFUN ("keyboard-coding-system",
8246        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8247        doc: /* Return coding system specified for decoding keyboard input.  */)
8248      ()
8249 {
8250   return CODING_ID_NAME (keyboard_coding.id);
8251 }
8252
8253 \f
8254 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8255        Sfind_operation_coding_system,  1, MANY, 0,
8256        doc: /* Choose a coding system for an operation based on the target name.
8257 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8258 DECODING-SYSTEM is the coding system to use for decoding
8259 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8260 for encoding (in case OPERATION does encoding).
8261
8262 The first argument OPERATION specifies an I/O primitive:
8263   For file I/O, `insert-file-contents' or `write-region'.
8264   For process I/O, `call-process', `call-process-region', or `start-process'.
8265   For network I/O, `open-network-stream'.
8266
8267 The remaining arguments should be the same arguments that were passed
8268 to the primitive.  Depending on which primitive, one of those arguments
8269 is selected as the TARGET.  For example, if OPERATION does file I/O,
8270 whichever argument specifies the file name is TARGET.
8271
8272 TARGET has a meaning which depends on OPERATION:
8273   For file I/O, TARGET is a file name.
8274   For process I/O, TARGET is a process name.
8275   For network I/O, TARGET is a service name or a port number
8276
8277 This function looks up what specified for TARGET in,
8278 `file-coding-system-alist', `process-coding-system-alist',
8279 or `network-coding-system-alist' depending on OPERATION.
8280 They may specify a coding system, a cons of coding systems,
8281 or a function symbol to call.
8282 In the last case, we call the function with one argument,
8283 which is a list of all the arguments given to this function.
8284
8285 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
8286      (nargs, args)
8287      int nargs;
8288      Lisp_Object *args;
8289 {
8290   Lisp_Object operation, target_idx, target, val;
8291   register Lisp_Object chain;
8292
8293   if (nargs < 2)
8294     error ("Too few arguments");
8295   operation = args[0];
8296   if (!SYMBOLP (operation)
8297       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8298     error ("Invalid first arguement");
8299   if (nargs < 1 + XINT (target_idx))
8300     error ("Too few arguments for operation: %s",
8301            SDATA (SYMBOL_NAME (operation)));
8302   target = args[XINT (target_idx) + 1];
8303   if (!(STRINGP (target)
8304         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8305     error ("Invalid %dth argument", XINT (target_idx) + 1);
8306
8307   chain = ((EQ (operation, Qinsert_file_contents)
8308             || EQ (operation, Qwrite_region))
8309            ? Vfile_coding_system_alist
8310            : (EQ (operation, Qopen_network_stream)
8311               ? Vnetwork_coding_system_alist
8312               : Vprocess_coding_system_alist));
8313   if (NILP (chain))
8314     return Qnil;
8315
8316   for (; CONSP (chain); chain = XCDR (chain))
8317     {
8318       Lisp_Object elt;
8319
8320       elt = XCAR (chain);
8321       if (CONSP (elt)
8322           && ((STRINGP (target)
8323                && STRINGP (XCAR (elt))
8324                && fast_string_match (XCAR (elt), target) >= 0)
8325               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8326         {
8327           val = XCDR (elt);
8328           /* Here, if VAL is both a valid coding system and a valid
8329              function symbol, we return VAL as a coding system.  */
8330           if (CONSP (val))
8331             return val;
8332           if (! SYMBOLP (val))
8333             return Qnil;
8334           if (! NILP (Fcoding_system_p (val)))
8335             return Fcons (val, val);
8336           if (! NILP (Ffboundp (val)))
8337             {
8338               val = call1 (val, Flist (nargs, args));
8339               if (CONSP (val))
8340                 return val;
8341               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8342                 return Fcons (val, val);
8343             }
8344           return Qnil;
8345         }
8346     }
8347   return Qnil;
8348 }
8349
8350 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8351        Sset_coding_system_priority, 0, MANY, 0,
8352        doc: /* Assign higher priority to the coding systems given as arguments.
8353 If multiple coding systems belongs to the same category,
8354 all but the first one are ignored.
8355
8356 usage: (set-coding-system-priority ...)  */)
8357      (nargs, args)
8358      int nargs;
8359      Lisp_Object *args;
8360 {
8361   int i, j;
8362   int changed[coding_category_max];
8363   enum coding_category priorities[coding_category_max];
8364
8365   bzero (changed, sizeof changed);
8366
8367   for (i = j = 0; i < nargs; i++)
8368     {
8369       enum coding_category category;
8370       Lisp_Object spec, attrs;
8371
8372       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8373       attrs = AREF (spec, 0);
8374       category = XINT (CODING_ATTR_CATEGORY (attrs));
8375       if (changed[category])
8376         /* Ignore this coding system because a coding system of the
8377            same category already had a higher priority.  */
8378         continue;
8379       changed[category] = 1;
8380       priorities[j++] = category;
8381       if (coding_categories[category].id >= 0
8382           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8383         setup_coding_system (args[i], &coding_categories[category]);
8384       Fset (AREF (Vcoding_category_table, category), args[i]);
8385     }
8386
8387   /* Now we have decided top J priorities.  Reflect the order of the
8388      original priorities to the remaining priorities.  */
8389
8390   for (i = j, j = 0; i < coding_category_max; i++, j++)
8391     {
8392       while (j < coding_category_max
8393              && changed[coding_priorities[j]])
8394         j++;
8395       if (j == coding_category_max)
8396         abort ();
8397       priorities[i] = coding_priorities[j];
8398     }
8399
8400   bcopy (priorities, coding_priorities, sizeof priorities);
8401
8402   /* Update `coding-category-list'.  */
8403   Vcoding_category_list = Qnil;
8404   for (i = coding_category_max - 1; i >= 0; i--)
8405     Vcoding_category_list
8406       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8407                Vcoding_category_list);
8408
8409   return Qnil;
8410 }
8411
8412 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8413        Scoding_system_priority_list, 0, 1, 0,
8414        doc: /* Return a list of coding systems ordered by their priorities.
8415 HIGHESTP non-nil means just return the highest priority one.  */)
8416      (highestp)
8417      Lisp_Object highestp;
8418 {
8419   int i;
8420   Lisp_Object val;
8421
8422   for (i = 0, val = Qnil; i < coding_category_max; i++)
8423     {
8424       enum coding_category category = coding_priorities[i];
8425       int id = coding_categories[category].id;
8426       Lisp_Object attrs;
8427
8428       if (id < 0)
8429         continue;
8430       attrs = CODING_ID_ATTRS (id);
8431       if (! NILP (highestp))
8432         return CODING_ATTR_BASE_NAME (attrs);
8433       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8434     }
8435   return Fnreverse (val);
8436 }
8437
8438 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8439
8440 static Lisp_Object
8441 make_subsidiaries (base)
8442      Lisp_Object base;
8443 {
8444   Lisp_Object subsidiaries;
8445   int base_name_len = SBYTES (SYMBOL_NAME (base));
8446   char *buf = (char *) alloca (base_name_len + 6);
8447   int i;
8448
8449   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8450   subsidiaries = Fmake_vector (make_number (3), Qnil);
8451   for (i = 0; i < 3; i++)
8452     {
8453       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8454       ASET (subsidiaries, i, intern (buf));
8455     }
8456   return subsidiaries;
8457 }
8458
8459
8460 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8461        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8462        doc: /* For internal use only.
8463 usage: (define-coding-system-internal ...)  */)
8464      (nargs, args)
8465      int nargs;
8466      Lisp_Object *args;
8467 {
8468   Lisp_Object name;
8469   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
8470   Lisp_Object attrs;            /* Vector of attributes.  */
8471   Lisp_Object eol_type;
8472   Lisp_Object aliases;
8473   Lisp_Object coding_type, charset_list, safe_charsets;
8474   enum coding_category category;
8475   Lisp_Object tail, val;
8476   int max_charset_id = 0;
8477   int i;
8478
8479   if (nargs < coding_arg_max)
8480     goto short_args;
8481
8482   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8483
8484   name = args[coding_arg_name];
8485   CHECK_SYMBOL (name);
8486   CODING_ATTR_BASE_NAME (attrs) = name;
8487
8488   val = args[coding_arg_mnemonic];
8489   if (! STRINGP (val))
8490     CHECK_CHARACTER (val);
8491   CODING_ATTR_MNEMONIC (attrs) = val;
8492
8493   coding_type = args[coding_arg_coding_type];
8494   CHECK_SYMBOL (coding_type);
8495   CODING_ATTR_TYPE (attrs) = coding_type;
8496
8497   charset_list = args[coding_arg_charset_list];
8498   if (SYMBOLP (charset_list))
8499     {
8500       if (EQ (charset_list, Qiso_2022))
8501         {
8502           if (! EQ (coding_type, Qiso_2022))
8503             error ("Invalid charset-list");
8504           charset_list = Viso_2022_charset_list;
8505         }
8506       else if (EQ (charset_list, Qemacs_mule))
8507         {
8508           if (! EQ (coding_type, Qemacs_mule))
8509             error ("Invalid charset-list");
8510           charset_list = Vemacs_mule_charset_list;
8511         }
8512       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8513         if (max_charset_id < XFASTINT (XCAR (tail)))
8514           max_charset_id = XFASTINT (XCAR (tail));
8515     }
8516   else
8517     {
8518       charset_list = Fcopy_sequence (charset_list);
8519       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8520         {
8521           struct charset *charset;
8522
8523           val = Fcar (tail);
8524           CHECK_CHARSET_GET_CHARSET (val, charset);
8525           if (EQ (coding_type, Qiso_2022)
8526               ? CHARSET_ISO_FINAL (charset) < 0
8527               : EQ (coding_type, Qemacs_mule)
8528               ? CHARSET_EMACS_MULE_ID (charset) < 0
8529               : 0)
8530             error ("Can't handle charset `%s'",
8531                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8532
8533           XSETCAR (tail, make_number (charset->id));
8534           if (max_charset_id < charset->id)
8535             max_charset_id = charset->id;
8536         }
8537     }
8538   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8539
8540   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8541                                 make_number (255));
8542   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8543     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8544   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8545
8546   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8547
8548   val = args[coding_arg_decode_translation_table];
8549   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8550     CHECK_SYMBOL (val);
8551   CODING_ATTR_DECODE_TBL (attrs) = val;
8552
8553   val = args[coding_arg_encode_translation_table];
8554   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8555     CHECK_SYMBOL (val);
8556   CODING_ATTR_ENCODE_TBL (attrs) = val;
8557
8558   val = args[coding_arg_post_read_conversion];
8559   CHECK_SYMBOL (val);
8560   CODING_ATTR_POST_READ (attrs) = val;
8561
8562   val = args[coding_arg_pre_write_conversion];
8563   CHECK_SYMBOL (val);
8564   CODING_ATTR_PRE_WRITE (attrs) = val;
8565
8566   val = args[coding_arg_default_char];
8567   if (NILP (val))
8568     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8569   else
8570     {
8571       CHECK_CHARACTER (val);
8572       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8573     }
8574
8575   val = args[coding_arg_for_unibyte];
8576   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8577
8578   val = args[coding_arg_plist];
8579   CHECK_LIST (val);
8580   CODING_ATTR_PLIST (attrs) = val;
8581
8582   if (EQ (coding_type, Qcharset))
8583     {
8584       /* Generate a lisp vector of 256 elements.  Each element is nil,
8585          integer, or a list of charset IDs.
8586
8587          If Nth element is nil, the byte code N is invalid in this
8588          coding system.
8589
8590          If Nth element is a number NUM, N is the first byte of a
8591          charset whose ID is NUM.
8592
8593          If Nth element is a list of charset IDs, N is the first byte
8594          of one of them.  The list is sorted by dimensions of the
8595          charsets.  A charset of smaller dimension comes firtst. */
8596       val = Fmake_vector (make_number (256), Qnil);
8597
8598       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8599         {
8600           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8601           int dim = CHARSET_DIMENSION (charset);
8602           int idx = (dim - 1) * 4;
8603
8604           if (CHARSET_ASCII_COMPATIBLE_P (charset))
8605             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8606
8607           for (i = charset->code_space[idx];
8608                i <= charset->code_space[idx + 1]; i++)
8609             {
8610               Lisp_Object tmp, tmp2;
8611               int dim2;
8612
8613               tmp = AREF (val, i);
8614               if (NILP (tmp))
8615                 tmp = XCAR (tail);
8616               else if (NUMBERP (tmp))
8617                 {
8618                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8619                   if (dim < dim2)
8620                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8621                   else
8622                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8623                 }
8624               else
8625                 {
8626                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8627                     {
8628                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8629                       if (dim < dim2)
8630                         break;
8631                     }
8632                   if (NILP (tmp2))
8633                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8634                   else
8635                     {
8636                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8637                       XSETCAR (tmp2, XCAR (tail));
8638                     }
8639                 }
8640               ASET (val, i, tmp);
8641             }
8642         }
8643       ASET (attrs, coding_attr_charset_valids, val);
8644       category = coding_category_charset;
8645     }
8646   else if (EQ (coding_type, Qccl))
8647     {
8648       Lisp_Object valids;
8649
8650       if (nargs < coding_arg_ccl_max)
8651         goto short_args;
8652
8653       val = args[coding_arg_ccl_decoder];
8654       CHECK_CCL_PROGRAM (val);
8655       if (VECTORP (val))
8656         val = Fcopy_sequence (val);
8657       ASET (attrs, coding_attr_ccl_decoder, val);
8658
8659       val = args[coding_arg_ccl_encoder];
8660       CHECK_CCL_PROGRAM (val);
8661       if (VECTORP (val))
8662         val = Fcopy_sequence (val);
8663       ASET (attrs, coding_attr_ccl_encoder, val);
8664
8665       val = args[coding_arg_ccl_valids];
8666       valids = Fmake_string (make_number (256), make_number (0));
8667       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8668         {
8669           int from, to;
8670
8671           val = Fcar (tail);
8672           if (INTEGERP (val))
8673             {
8674               from = to = XINT (val);
8675               if (from < 0 || from > 255)
8676                 args_out_of_range_3 (val, make_number (0), make_number (255));
8677             }
8678           else
8679             {
8680               CHECK_CONS (val);
8681               CHECK_NATNUM_CAR (val);
8682               CHECK_NATNUM_CDR (val);
8683               from = XINT (XCAR (val));
8684               if (from > 255)
8685                 args_out_of_range_3 (XCAR (val),
8686                                      make_number (0), make_number (255));
8687               to = XINT (XCDR (val));
8688               if (to < from || to > 255)
8689                 args_out_of_range_3 (XCDR (val),
8690                                      XCAR (val), make_number (255));
8691             }
8692           for (i = from; i <= to; i++)
8693             SSET (valids, i, 1);
8694         }
8695       ASET (attrs, coding_attr_ccl_valids, valids);
8696
8697       category = coding_category_ccl;
8698     }
8699   else if (EQ (coding_type, Qutf_16))
8700     {
8701       Lisp_Object bom, endian;
8702
8703       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8704
8705       if (nargs < coding_arg_utf16_max)
8706         goto short_args;
8707
8708       bom = args[coding_arg_utf16_bom];
8709       if (! NILP (bom) && ! EQ (bom, Qt))
8710         {
8711           CHECK_CONS (bom);
8712           val = XCAR (bom);
8713           CHECK_CODING_SYSTEM (val);
8714           val = XCDR (bom);
8715           CHECK_CODING_SYSTEM (val);
8716         }
8717       ASET (attrs, coding_attr_utf_16_bom, bom);
8718
8719       endian = args[coding_arg_utf16_endian];
8720       CHECK_SYMBOL (endian);
8721       if (NILP (endian))
8722         endian = Qbig;
8723       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8724         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8725       ASET (attrs, coding_attr_utf_16_endian, endian);
8726
8727       category = (CONSP (bom)
8728                   ? coding_category_utf_16_auto
8729                   : NILP (bom)
8730                   ? (EQ (endian, Qbig)
8731                      ? coding_category_utf_16_be_nosig
8732                      : coding_category_utf_16_le_nosig)
8733                   : (EQ (endian, Qbig)
8734                      ? coding_category_utf_16_be
8735                      : coding_category_utf_16_le));
8736     }
8737   else if (EQ (coding_type, Qiso_2022))
8738     {
8739       Lisp_Object initial, reg_usage, request, flags;
8740       int i;
8741
8742       if (nargs < coding_arg_iso2022_max)
8743         goto short_args;
8744
8745       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8746       CHECK_VECTOR (initial);
8747       for (i = 0; i < 4; i++)
8748         {
8749           val = Faref (initial, make_number (i));
8750           if (! NILP (val))
8751             {
8752               struct charset *charset;
8753
8754               CHECK_CHARSET_GET_CHARSET (val, charset);
8755               ASET (initial, i, make_number (CHARSET_ID (charset)));
8756               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8757                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8758             }
8759           else
8760             ASET (initial, i, make_number (-1));
8761         }
8762
8763       reg_usage = args[coding_arg_iso2022_reg_usage];
8764       CHECK_CONS (reg_usage);
8765       CHECK_NUMBER_CAR (reg_usage);
8766       CHECK_NUMBER_CDR (reg_usage);
8767
8768       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8769       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8770         {
8771           int id;
8772           Lisp_Object tmp;
8773
8774           val = Fcar (tail);
8775           CHECK_CONS (val);
8776           tmp = XCAR (val);
8777           CHECK_CHARSET_GET_ID (tmp, id);
8778           CHECK_NATNUM_CDR (val);
8779           if (XINT (XCDR (val)) >= 4)
8780             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8781           XSETCAR (val, make_number (id));
8782         }
8783
8784       flags = args[coding_arg_iso2022_flags];
8785       CHECK_NATNUM (flags);
8786       i = XINT (flags);
8787       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8788         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8789
8790       ASET (attrs, coding_attr_iso_initial, initial);
8791       ASET (attrs, coding_attr_iso_usage, reg_usage);
8792       ASET (attrs, coding_attr_iso_request, request);
8793       ASET (attrs, coding_attr_iso_flags, flags);
8794       setup_iso_safe_charsets (attrs);
8795
8796       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8797         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8798                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8799                     ? coding_category_iso_7_else
8800                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8801                     ? coding_category_iso_7
8802                     : coding_category_iso_7_tight);
8803       else
8804         {
8805           int id = XINT (AREF (initial, 1));
8806
8807           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8808                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8809                        || id < 0)
8810                       ? coding_category_iso_8_else
8811                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8812                       ? coding_category_iso_8_1
8813                       : coding_category_iso_8_2);
8814         }
8815       if (category != coding_category_iso_8_1
8816           && category != coding_category_iso_8_2)
8817         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8818     }
8819   else if (EQ (coding_type, Qemacs_mule))
8820     {
8821       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8822         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8823       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8824       category = coding_category_emacs_mule;
8825     }
8826   else if (EQ (coding_type, Qshift_jis))
8827     {
8828
8829       struct charset *charset;
8830
8831       if (XINT (Flength (charset_list)) != 3
8832           && XINT (Flength (charset_list)) != 4)
8833         error ("There should be three or four charsets");
8834
8835       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8836       if (CHARSET_DIMENSION (charset) != 1)
8837         error ("Dimension of charset %s is not one",
8838                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8839       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8840         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8841
8842       charset_list = XCDR (charset_list);
8843       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8844       if (CHARSET_DIMENSION (charset) != 1)
8845         error ("Dimension of charset %s is not one",
8846                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8847
8848       charset_list = XCDR (charset_list);
8849       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8850       if (CHARSET_DIMENSION (charset) != 2)
8851         error ("Dimension of charset %s is not two",
8852                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8853
8854       charset_list = XCDR (charset_list);
8855       if (! NILP (charset_list))
8856         {
8857           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8858           if (CHARSET_DIMENSION (charset) != 2)
8859             error ("Dimension of charset %s is not two",
8860                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8861         }
8862
8863       category = coding_category_sjis;
8864       Vsjis_coding_system = name;
8865     }
8866   else if (EQ (coding_type, Qbig5))
8867     {
8868       struct charset *charset;
8869
8870       if (XINT (Flength (charset_list)) != 2)
8871         error ("There should be just two charsets");
8872
8873       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8874       if (CHARSET_DIMENSION (charset) != 1)
8875         error ("Dimension of charset %s is not one",
8876                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8877       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8878         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8879
8880       charset_list = XCDR (charset_list);
8881       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8882       if (CHARSET_DIMENSION (charset) != 2)
8883         error ("Dimension of charset %s is not two",
8884                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8885
8886       category = coding_category_big5;
8887       Vbig5_coding_system = name;
8888     }
8889   else if (EQ (coding_type, Qraw_text))
8890     {
8891       category = coding_category_raw_text;
8892       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8893     }
8894   else if (EQ (coding_type, Qutf_8))
8895     {
8896       category = coding_category_utf_8;
8897       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8898     }
8899   else if (EQ (coding_type, Qundecided))
8900     category = coding_category_undecided;
8901   else
8902     error ("Invalid coding system type: %s",
8903            SDATA (SYMBOL_NAME (coding_type)));
8904
8905   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8906   CODING_ATTR_PLIST (attrs)
8907     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8908                                 CODING_ATTR_PLIST (attrs)));
8909   CODING_ATTR_PLIST (attrs)
8910     = Fcons (QCascii_compatible_p,
8911              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
8912                     CODING_ATTR_PLIST (attrs)));
8913
8914   eol_type = args[coding_arg_eol_type];
8915   if (! NILP (eol_type)
8916       && ! EQ (eol_type, Qunix)
8917       && ! EQ (eol_type, Qdos)
8918       && ! EQ (eol_type, Qmac))
8919     error ("Invalid eol-type");
8920
8921   aliases = Fcons (name, Qnil);
8922
8923   if (NILP (eol_type))
8924     {
8925       eol_type = make_subsidiaries (name);
8926       for (i = 0; i < 3; i++)
8927         {
8928           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8929
8930           this_name = AREF (eol_type, i);
8931           this_aliases = Fcons (this_name, Qnil);
8932           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8933           this_spec = Fmake_vector (make_number (3), attrs);
8934           ASET (this_spec, 1, this_aliases);
8935           ASET (this_spec, 2, this_eol_type);
8936           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8937           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8938           val = Fassoc (Fsymbol_name (this_name), Vcoding_system_alist);
8939           if (NILP (val))
8940             Vcoding_system_alist
8941               = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8942                        Vcoding_system_alist);
8943         }
8944     }
8945
8946   spec_vec = Fmake_vector (make_number (3), attrs);
8947   ASET (spec_vec, 1, aliases);
8948   ASET (spec_vec, 2, eol_type);
8949
8950   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8951   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8952   val = Fassoc (Fsymbol_name (name), Vcoding_system_alist);
8953   if (NILP (val))
8954     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8955                                   Vcoding_system_alist);
8956
8957   {
8958     int id = coding_categories[category].id;
8959
8960     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8961       setup_coding_system (name, &coding_categories[category]);
8962   }
8963
8964   return Qnil;
8965
8966  short_args:
8967   return Fsignal (Qwrong_number_of_arguments,
8968                   Fcons (intern ("define-coding-system-internal"),
8969                          make_number (nargs)));
8970 }
8971
8972
8973 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8974        3, 3, 0,
8975        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
8976   (coding_system, prop, val)
8977      Lisp_Object coding_system, prop, val;
8978 {
8979   Lisp_Object spec, attrs;
8980
8981   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8982   attrs = AREF (spec, 0);
8983   if (EQ (prop, QCmnemonic))
8984     {
8985       if (! STRINGP (val))
8986         CHECK_CHARACTER (val);
8987       CODING_ATTR_MNEMONIC (attrs) = val;
8988     }
8989   else if (EQ (prop, QCdefalut_char))
8990     {
8991       if (NILP (val))
8992         val = make_number (' ');
8993       else
8994         CHECK_CHARACTER (val);
8995       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8996     }
8997   else if (EQ (prop, QCdecode_translation_table))
8998     {
8999       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9000         CHECK_SYMBOL (val);
9001       CODING_ATTR_DECODE_TBL (attrs) = val;
9002     }
9003   else if (EQ (prop, QCencode_translation_table))
9004     {
9005       if (! CHAR_TABLE_P (val) && ! CONSP (val))
9006         CHECK_SYMBOL (val);
9007       CODING_ATTR_ENCODE_TBL (attrs) = val;
9008     }
9009   else if (EQ (prop, QCpost_read_conversion))
9010     {
9011       CHECK_SYMBOL (val);
9012       CODING_ATTR_POST_READ (attrs) = val;
9013     }
9014   else if (EQ (prop, QCpre_write_conversion))
9015     {
9016       CHECK_SYMBOL (val);
9017       CODING_ATTR_PRE_WRITE (attrs) = val;
9018     }
9019   else if (EQ (prop, QCascii_compatible_p))
9020     {
9021       CODING_ATTR_ASCII_COMPAT (attrs) = val;
9022     }
9023
9024   CODING_ATTR_PLIST (attrs)
9025     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9026   return val;
9027 }
9028
9029
9030 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9031        Sdefine_coding_system_alias, 2, 2, 0,
9032        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
9033      (alias, coding_system)
9034      Lisp_Object alias, coding_system;
9035 {
9036   Lisp_Object spec, aliases, eol_type, val;
9037
9038   CHECK_SYMBOL (alias);
9039   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9040   aliases = AREF (spec, 1);
9041   /* ALISES should be a list of length more than zero, and the first
9042      element is a base coding system.  Append ALIAS at the tail of the
9043      list.  */
9044   while (!NILP (XCDR (aliases)))
9045     aliases = XCDR (aliases);
9046   XSETCDR (aliases, Fcons (alias, Qnil));
9047
9048   eol_type = AREF (spec, 2);
9049   if (VECTORP (eol_type))
9050     {
9051       Lisp_Object subsidiaries;
9052       int i;
9053
9054       subsidiaries = make_subsidiaries (alias);
9055       for (i = 0; i < 3; i++)
9056         Fdefine_coding_system_alias (AREF (subsidiaries, i),
9057                                      AREF (eol_type, i));
9058     }
9059
9060   Fputhash (alias, spec, Vcoding_system_hash_table);
9061   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9062   val = Fassoc (Fsymbol_name (alias), Vcoding_system_alist);
9063   if (NILP (val))
9064     Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9065                                   Vcoding_system_alist);
9066
9067   return Qnil;
9068 }
9069
9070 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9071        1, 1, 0,
9072        doc: /* Return the base of CODING-SYSTEM.
9073 Any alias or subsidiary coding system is not a base coding system.  */)
9074   (coding_system)
9075      Lisp_Object coding_system;
9076 {
9077   Lisp_Object spec, attrs;
9078
9079   if (NILP (coding_system))
9080     return (Qno_conversion);
9081   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9082   attrs = AREF (spec, 0);
9083   return CODING_ATTR_BASE_NAME (attrs);
9084 }
9085
9086 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9087        1, 1, 0,
9088        doc: "Return the property list of CODING-SYSTEM.")
9089      (coding_system)
9090      Lisp_Object coding_system;
9091 {
9092   Lisp_Object spec, attrs;
9093
9094   if (NILP (coding_system))
9095     coding_system = Qno_conversion;
9096   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9097   attrs = AREF (spec, 0);
9098   return CODING_ATTR_PLIST (attrs);
9099 }
9100
9101
9102 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9103        1, 1, 0,
9104        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
9105      (coding_system)
9106      Lisp_Object coding_system;
9107 {
9108   Lisp_Object spec;
9109
9110   if (NILP (coding_system))
9111     coding_system = Qno_conversion;
9112   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9113   return AREF (spec, 1);
9114 }
9115
9116 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9117        Scoding_system_eol_type, 1, 1, 0,
9118        doc: /* Return eol-type of CODING-SYSTEM.
9119 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9120
9121 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9122 and CR respectively.
9123
9124 A vector value indicates that a format of end-of-line should be
9125 detected automatically.  Nth element of the vector is the subsidiary
9126 coding system whose eol-type is N.  */)
9127      (coding_system)
9128      Lisp_Object coding_system;
9129 {
9130   Lisp_Object spec, eol_type;
9131   int n;
9132
9133   if (NILP (coding_system))
9134     coding_system = Qno_conversion;
9135   if (! CODING_SYSTEM_P (coding_system))
9136     return Qnil;
9137   spec = CODING_SYSTEM_SPEC (coding_system);
9138   eol_type = AREF (spec, 2);
9139   if (VECTORP (eol_type))
9140     return Fcopy_sequence (eol_type);
9141   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9142   return make_number (n);
9143 }
9144
9145 #endif /* emacs */
9146
9147 \f
9148 /*** 12. Postamble ***/
9149
9150 void
9151 init_coding_once ()
9152 {
9153   int i;
9154
9155   for (i = 0; i < coding_category_max; i++)
9156     {
9157       coding_categories[i].id = -1;
9158       coding_priorities[i] = i;
9159     }
9160
9161   /* ISO2022 specific initialize routine.  */
9162   for (i = 0; i < 0x20; i++)
9163     iso_code_class[i] = ISO_control_0;
9164   for (i = 0x21; i < 0x7F; i++)
9165     iso_code_class[i] = ISO_graphic_plane_0;
9166   for (i = 0x80; i < 0xA0; i++)
9167     iso_code_class[i] = ISO_control_1;
9168   for (i = 0xA1; i < 0xFF; i++)
9169     iso_code_class[i] = ISO_graphic_plane_1;
9170   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9171   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9172   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9173   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9174   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9175   iso_code_class[ISO_CODE_ESC] = ISO_escape;
9176   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9177   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9178   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9179
9180   for (i = 0; i < 256; i++)
9181     {
9182       emacs_mule_bytes[i] = 1;
9183     }
9184   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9185   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9186   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9187   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9188 }
9189
9190 #ifdef emacs
9191
9192 void
9193 syms_of_coding ()
9194 {
9195   staticpro (&Vcoding_system_hash_table);
9196   {
9197     Lisp_Object args[2];
9198     args[0] = QCtest;
9199     args[1] = Qeq;
9200     Vcoding_system_hash_table = Fmake_hash_table (2, args);
9201   }
9202
9203   staticpro (&Vsjis_coding_system);
9204   Vsjis_coding_system = Qnil;
9205
9206   staticpro (&Vbig5_coding_system);
9207   Vbig5_coding_system = Qnil;
9208
9209   staticpro (&Vcode_conversion_reused_workbuf);
9210   Vcode_conversion_reused_workbuf = Qnil;
9211
9212   staticpro (&Vcode_conversion_workbuf_name);
9213   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9214
9215   reused_workbuf_in_use = 0;
9216
9217   DEFSYM (Qcharset, "charset");
9218   DEFSYM (Qtarget_idx, "target-idx");
9219   DEFSYM (Qcoding_system_history, "coding-system-history");
9220   Fset (Qcoding_system_history, Qnil);
9221
9222   /* Target FILENAME is the first argument.  */
9223   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9224   /* Target FILENAME is the third argument.  */
9225   Fput (Qwrite_region, Qtarget_idx, make_number (2));
9226
9227   DEFSYM (Qcall_process, "call-process");
9228   /* Target PROGRAM is the first argument.  */
9229   Fput (Qcall_process, Qtarget_idx, make_number (0));
9230
9231   DEFSYM (Qcall_process_region, "call-process-region");
9232   /* Target PROGRAM is the third argument.  */
9233   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9234
9235   DEFSYM (Qstart_process, "start-process");
9236   /* Target PROGRAM is the third argument.  */
9237   Fput (Qstart_process, Qtarget_idx, make_number (2));
9238
9239   DEFSYM (Qopen_network_stream, "open-network-stream");
9240   /* Target SERVICE is the fourth argument.  */
9241   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9242
9243   DEFSYM (Qcoding_system, "coding-system");
9244   DEFSYM (Qcoding_aliases, "coding-aliases");
9245
9246   DEFSYM (Qeol_type, "eol-type");
9247   DEFSYM (Qunix, "unix");
9248   DEFSYM (Qdos, "dos");
9249
9250   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9251   DEFSYM (Qpost_read_conversion, "post-read-conversion");
9252   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9253   DEFSYM (Qdefault_char, "default-char");
9254   DEFSYM (Qundecided, "undecided");
9255   DEFSYM (Qno_conversion, "no-conversion");
9256   DEFSYM (Qraw_text, "raw-text");
9257
9258   DEFSYM (Qiso_2022, "iso-2022");
9259
9260   DEFSYM (Qutf_8, "utf-8");
9261   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9262
9263   DEFSYM (Qutf_16, "utf-16");
9264   DEFSYM (Qbig, "big");
9265   DEFSYM (Qlittle, "little");
9266
9267   DEFSYM (Qshift_jis, "shift-jis");
9268   DEFSYM (Qbig5, "big5");
9269
9270   DEFSYM (Qcoding_system_p, "coding-system-p");
9271
9272   DEFSYM (Qcoding_system_error, "coding-system-error");
9273   Fput (Qcoding_system_error, Qerror_conditions,
9274         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9275   Fput (Qcoding_system_error, Qerror_message,
9276         build_string ("Invalid coding system"));
9277
9278   /* Intern this now in case it isn't already done.
9279      Setting this variable twice is harmless.
9280      But don't staticpro it here--that is done in alloc.c.  */
9281   Qchar_table_extra_slots = intern ("char-table-extra-slots");
9282
9283   DEFSYM (Qtranslation_table, "translation-table");
9284   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9285   DEFSYM (Qtranslation_table_id, "translation-table-id");
9286   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9287   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9288
9289   DEFSYM (Qvalid_codes, "valid-codes");
9290
9291   DEFSYM (Qemacs_mule, "emacs-mule");
9292
9293   DEFSYM (QCcategory, ":category");
9294   DEFSYM (QCmnemonic, ":mnemonic");
9295   DEFSYM (QCdefalut_char, ":default-char");
9296   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9297   DEFSYM (QCencode_translation_table, ":encode-translation-table");
9298   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9299   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9300   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9301
9302   Vcoding_category_table
9303     = Fmake_vector (make_number (coding_category_max), Qnil);
9304   staticpro (&Vcoding_category_table);
9305   /* Followings are target of code detection.  */
9306   ASET (Vcoding_category_table, coding_category_iso_7,
9307         intern ("coding-category-iso-7"));
9308   ASET (Vcoding_category_table, coding_category_iso_7_tight,
9309         intern ("coding-category-iso-7-tight"));
9310   ASET (Vcoding_category_table, coding_category_iso_8_1,
9311         intern ("coding-category-iso-8-1"));
9312   ASET (Vcoding_category_table, coding_category_iso_8_2,
9313         intern ("coding-category-iso-8-2"));
9314   ASET (Vcoding_category_table, coding_category_iso_7_else,
9315         intern ("coding-category-iso-7-else"));
9316   ASET (Vcoding_category_table, coding_category_iso_8_else,
9317         intern ("coding-category-iso-8-else"));
9318   ASET (Vcoding_category_table, coding_category_utf_8,
9319         intern ("coding-category-utf-8"));
9320   ASET (Vcoding_category_table, coding_category_utf_16_be,
9321         intern ("coding-category-utf-16-be"));
9322   ASET (Vcoding_category_table, coding_category_utf_16_auto,
9323         intern ("coding-category-utf-16-auto"));
9324   ASET (Vcoding_category_table, coding_category_utf_16_le,
9325         intern ("coding-category-utf-16-le"));
9326   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9327         intern ("coding-category-utf-16-be-nosig"));
9328   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9329         intern ("coding-category-utf-16-le-nosig"));
9330   ASET (Vcoding_category_table, coding_category_charset,
9331         intern ("coding-category-charset"));
9332   ASET (Vcoding_category_table, coding_category_sjis,
9333         intern ("coding-category-sjis"));
9334   ASET (Vcoding_category_table, coding_category_big5,
9335         intern ("coding-category-big5"));
9336   ASET (Vcoding_category_table, coding_category_ccl,
9337         intern ("coding-category-ccl"));
9338   ASET (Vcoding_category_table, coding_category_emacs_mule,
9339         intern ("coding-category-emacs-mule"));
9340   /* Followings are NOT target of code detection.  */
9341   ASET (Vcoding_category_table, coding_category_raw_text,
9342         intern ("coding-category-raw-text"));
9343   ASET (Vcoding_category_table, coding_category_undecided,
9344         intern ("coding-category-undecided"));
9345
9346   DEFSYM (Qinsufficient_source, "insufficient-source");
9347   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9348   DEFSYM (Qinvalid_source, "invalid-source");
9349   DEFSYM (Qinterrupted, "interrupted");
9350   DEFSYM (Qinsufficient_memory, "insufficient-memory");
9351   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
9352
9353   defsubr (&Scoding_system_p);
9354   defsubr (&Sread_coding_system);
9355   defsubr (&Sread_non_nil_coding_system);
9356   defsubr (&Scheck_coding_system);
9357   defsubr (&Sdetect_coding_region);
9358   defsubr (&Sdetect_coding_string);
9359   defsubr (&Sfind_coding_systems_region_internal);
9360   defsubr (&Sunencodable_char_position);
9361   defsubr (&Scheck_coding_systems_region);
9362   defsubr (&Sdecode_coding_region);
9363   defsubr (&Sencode_coding_region);
9364   defsubr (&Sdecode_coding_string);
9365   defsubr (&Sencode_coding_string);
9366   defsubr (&Sdecode_sjis_char);
9367   defsubr (&Sencode_sjis_char);
9368   defsubr (&Sdecode_big5_char);
9369   defsubr (&Sencode_big5_char);
9370   defsubr (&Sset_terminal_coding_system_internal);
9371   defsubr (&Sset_safe_terminal_coding_system_internal);
9372   defsubr (&Sterminal_coding_system);
9373   defsubr (&Sset_keyboard_coding_system_internal);
9374   defsubr (&Skeyboard_coding_system);
9375   defsubr (&Sfind_operation_coding_system);
9376   defsubr (&Sset_coding_system_priority);
9377   defsubr (&Sdefine_coding_system_internal);
9378   defsubr (&Sdefine_coding_system_alias);
9379   defsubr (&Scoding_system_put);
9380   defsubr (&Scoding_system_base);
9381   defsubr (&Scoding_system_plist);
9382   defsubr (&Scoding_system_aliases);
9383   defsubr (&Scoding_system_eol_type);
9384   defsubr (&Scoding_system_priority_list);
9385
9386   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9387                doc: /* List of coding systems.
9388
9389 Do not alter the value of this variable manually.  This variable should be
9390 updated by the functions `define-coding-system' and
9391 `define-coding-system-alias'.  */);
9392   Vcoding_system_list = Qnil;
9393
9394   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9395                doc: /* Alist of coding system names.
9396 Each element is one element list of coding system name.
9397 This variable is given to `completing-read' as TABLE argument.
9398
9399 Do not alter the value of this variable manually.  This variable should be
9400 updated by the functions `make-coding-system' and
9401 `define-coding-system-alias'.  */);
9402   Vcoding_system_alist = Qnil;
9403
9404   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9405                doc: /* List of coding-categories (symbols) ordered by priority.
9406
9407 On detecting a coding system, Emacs tries code detection algorithms
9408 associated with each coding-category one by one in this order.  When
9409 one algorithm agrees with a byte sequence of source text, the coding
9410 system bound to the corresponding coding-category is selected.
9411
9412 Don't modify this variable directly, but use `set-coding-priority'.  */);
9413   {
9414     int i;
9415
9416     Vcoding_category_list = Qnil;
9417     for (i = coding_category_max - 1; i >= 0; i--)
9418       Vcoding_category_list
9419         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9420                  Vcoding_category_list);
9421   }
9422
9423   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9424                doc: /* Specify the coding system for read operations.
9425 It is useful to bind this variable with `let', but do not set it globally.
9426 If the value is a coding system, it is used for decoding on read operation.
9427 If not, an appropriate element is used from one of the coding system alists:
9428 There are three such tables, `file-coding-system-alist',
9429 `process-coding-system-alist', and `network-coding-system-alist'.  */);
9430   Vcoding_system_for_read = Qnil;
9431
9432   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9433                doc: /* Specify the coding system for write operations.
9434 Programs bind this variable with `let', but you should not set it globally.
9435 If the value is a coding system, it is used for encoding of output,
9436 when writing it to a file and when sending it to a file or subprocess.
9437
9438 If this does not specify a coding system, an appropriate element
9439 is used from one of the coding system alists:
9440 There are three such tables, `file-coding-system-alist',
9441 `process-coding-system-alist', and `network-coding-system-alist'.
9442 For output to files, if the above procedure does not specify a coding system,
9443 the value of `buffer-file-coding-system' is used.  */);
9444   Vcoding_system_for_write = Qnil;
9445
9446   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9447                doc: /*
9448 Coding system used in the latest file or process I/O.  */);
9449   Vlast_coding_system_used = Qnil;
9450
9451   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9452                doc: /*
9453 Error status of the last code conversion.
9454
9455 When an error was detected in the last code conversion, this variable
9456 is set to one of the following symbols.
9457   `insufficient-source'
9458   `inconsistent-eol'
9459   `invalid-source'
9460   `interrupted'
9461   `insufficient-memory'
9462 When no error was detected, the value doesn't change.  So, to check
9463 the error status of a code conversion by this variable, you must
9464 explicitly set this variable to nil before performing code
9465 conversion.  */);
9466   Vlast_code_conversion_error = Qnil;
9467
9468   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9469                doc: /*
9470 *Non-nil means always inhibit code conversion of end-of-line format.
9471 See info node `Coding Systems' and info node `Text and Binary' concerning
9472 such conversion.  */);
9473   inhibit_eol_conversion = 0;
9474
9475   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9476                doc: /*
9477 Non-nil means process buffer inherits coding system of process output.
9478 Bind it to t if the process output is to be treated as if it were a file
9479 read from some filesystem.  */);
9480   inherit_process_coding_system = 0;
9481
9482   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9483                doc: /*
9484 Alist to decide a coding system to use for a file I/O operation.
9485 The format is ((PATTERN . VAL) ...),
9486 where PATTERN is a regular expression matching a file name,
9487 VAL is a coding system, a cons of coding systems, or a function symbol.
9488 If VAL is a coding system, it is used for both decoding and encoding
9489 the file contents.
9490 If VAL is a cons of coding systems, the car part is used for decoding,
9491 and the cdr part is used for encoding.
9492 If VAL is a function symbol, the function must return a coding system
9493 or a cons of coding systems which are used as above.  The function gets
9494 the arguments with which `find-operation-coding-systems' was called.
9495
9496 See also the function `find-operation-coding-system'
9497 and the variable `auto-coding-alist'.  */);
9498   Vfile_coding_system_alist = Qnil;
9499
9500   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9501                doc: /*
9502 Alist to decide a coding system to use for a process I/O operation.
9503 The format is ((PATTERN . VAL) ...),
9504 where PATTERN is a regular expression matching a program name,
9505 VAL is a coding system, a cons of coding systems, or a function symbol.
9506 If VAL is a coding system, it is used for both decoding what received
9507 from the program and encoding what sent to the program.
9508 If VAL is a cons of coding systems, the car part is used for decoding,
9509 and the cdr part is used for encoding.
9510 If VAL is a function symbol, the function must return a coding system
9511 or a cons of coding systems which are used as above.
9512
9513 See also the function `find-operation-coding-system'.  */);
9514   Vprocess_coding_system_alist = Qnil;
9515
9516   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9517                doc: /*
9518 Alist to decide a coding system to use for a network I/O operation.
9519 The format is ((PATTERN . VAL) ...),
9520 where PATTERN is a regular expression matching a network service name
9521 or is a port number to connect to,
9522 VAL is a coding system, a cons of coding systems, or a function symbol.
9523 If VAL is a coding system, it is used for both decoding what received
9524 from the network stream and encoding what sent to the network stream.
9525 If VAL is a cons of coding systems, the car part is used for decoding,
9526 and the cdr part is used for encoding.
9527 If VAL is a function symbol, the function must return a coding system
9528 or a cons of coding systems which are used as above.
9529
9530 See also the function `find-operation-coding-system'.  */);
9531   Vnetwork_coding_system_alist = Qnil;
9532
9533   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9534                doc: /* Coding system to use with system messages.
9535 Also used for decoding keyboard input on X Window system.  */);
9536   Vlocale_coding_system = Qnil;
9537
9538   /* The eol mnemonics are reset in startup.el system-dependently.  */
9539   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9540                doc: /*
9541 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
9542   eol_mnemonic_unix = build_string (":");
9543
9544   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9545                doc: /*
9546 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
9547   eol_mnemonic_dos = build_string ("\\");
9548
9549   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9550                doc: /*
9551 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
9552   eol_mnemonic_mac = build_string ("/");
9553
9554   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9555                doc: /*
9556 *String displayed in mode line when end-of-line format is not yet determined.  */);
9557   eol_mnemonic_undecided = build_string (":");
9558
9559   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9560                doc: /*
9561 *Non-nil enables character translation while encoding and decoding.  */);
9562   Venable_character_translation = Qt;
9563
9564   DEFVAR_LISP ("standard-translation-table-for-decode",
9565                &Vstandard_translation_table_for_decode,
9566                doc: /* Table for translating characters while decoding.  */);
9567   Vstandard_translation_table_for_decode = Qnil;
9568
9569   DEFVAR_LISP ("standard-translation-table-for-encode",
9570                &Vstandard_translation_table_for_encode,
9571                doc: /* Table for translating characters while encoding.  */);
9572   Vstandard_translation_table_for_encode = Qnil;
9573
9574   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9575                doc: /* Alist of charsets vs revision numbers.
9576 While encoding, if a charset (car part of an element) is found,
9577 designate it with the escape sequence identifying revision (cdr part
9578 of the element).  */);
9579   Vcharset_revision_table = Qnil;
9580
9581   DEFVAR_LISP ("default-process-coding-system",
9582                &Vdefault_process_coding_system,
9583                doc: /* Cons of coding systems used for process I/O by default.
9584 The car part is used for decoding a process output,
9585 the cdr part is used for encoding a text to be sent to a process.  */);
9586   Vdefault_process_coding_system = Qnil;
9587
9588   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9589                doc: /*
9590 Table of extra Latin codes in the range 128..159 (inclusive).
9591 This is a vector of length 256.
9592 If Nth element is non-nil, the existence of code N in a file
9593 \(or output of subprocess) doesn't prevent it to be detected as
9594 a coding system of ISO 2022 variant which has a flag
9595 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9596 or reading output of a subprocess.
9597 Only 128th through 159th elements has a meaning.  */);
9598   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9599
9600   DEFVAR_LISP ("select-safe-coding-system-function",
9601                &Vselect_safe_coding_system_function,
9602                doc: /*
9603 Function to call to select safe coding system for encoding a text.
9604
9605 If set, this function is called to force a user to select a proper
9606 coding system which can encode the text in the case that a default
9607 coding system used in each operation can't encode the text.
9608
9609 The default value is `select-safe-coding-system' (which see).  */);
9610   Vselect_safe_coding_system_function = Qnil;
9611
9612   DEFVAR_BOOL ("coding-system-require-warning",
9613                &coding_system_require_warning,
9614                doc: /* Internal use only.
9615 If non-nil, on writing a file, `select-safe-coding-system-function' is
9616 called even if `coding-system-for-write' is non-nil.  The command
9617 `universal-coding-system-argument' binds this variable to t temporarily.  */);
9618   coding_system_require_warning = 0;
9619
9620
9621   DEFVAR_BOOL ("inhibit-iso-escape-detection",
9622                &inhibit_iso_escape_detection,
9623                doc: /*
9624 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9625
9626 By default, on reading a file, Emacs tries to detect how the text is
9627 encoded.  This code detection is sensitive to escape sequences.  If
9628 the sequence is valid as ISO2022, the code is determined as one of
9629 the ISO2022 encodings, and the file is decoded by the corresponding
9630 coding system (e.g. `iso-2022-7bit').
9631
9632 However, there may be a case that you want to read escape sequences in
9633 a file as is.  In such a case, you can set this variable to non-nil.
9634 Then, as the code detection ignores any escape sequences, no file is
9635 detected as encoded in some ISO2022 encoding.  The result is that all
9636 escape sequences become visible in a buffer.
9637
9638 The default value is nil, and it is strongly recommended not to change
9639 it.  That is because many Emacs Lisp source files that contain
9640 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9641 in Emacs's distribution, and they won't be decoded correctly on
9642 reading if you suppress escape sequence detection.
9643
9644 The other way to read escape sequences in a file without decoding is
9645 to explicitly specify some coding system that doesn't use ISO2022's
9646 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
9647   inhibit_iso_escape_detection = 0;
9648
9649   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9650                doc: /* Char table for translating self-inserting characters.
9651 This is applied to the result of input methods, not their input.  See also
9652 `keyboard-translate-table'.  */);
9653     Vtranslation_table_for_input = Qnil;
9654
9655   {
9656     Lisp_Object args[coding_arg_max];
9657     Lisp_Object plist[16];
9658     int i;
9659
9660     for (i = 0; i < coding_arg_max; i++)
9661       args[i] = Qnil;
9662
9663     plist[0] = intern (":name");
9664     plist[1] = args[coding_arg_name] = Qno_conversion;
9665     plist[2] = intern (":mnemonic");
9666     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9667     plist[4] = intern (":coding-type");
9668     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9669     plist[6] = intern (":ascii-compatible-p");
9670     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9671     plist[8] = intern (":default-char");
9672     plist[9] = args[coding_arg_default_char] = make_number (0);
9673     plist[10] = intern (":for-unibyte");
9674     plist[11] = args[coding_arg_for_unibyte] = Qt;
9675     plist[12] = intern (":docstring");
9676     plist[13] = build_string ("Do no conversion.\n\
9677 \n\
9678 When you visit a file with this coding, the file is read into a\n\
9679 unibyte buffer as is, thus each byte of a file is treated as a\n\
9680 character.");
9681     plist[14] = intern (":eol-type");
9682     plist[15] = args[coding_arg_eol_type] = Qunix;
9683     args[coding_arg_plist] = Flist (16, plist);
9684     Fdefine_coding_system_internal (coding_arg_max, args);
9685
9686     plist[1] = args[coding_arg_name] = Qundecided;
9687     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
9688     plist[5] = args[coding_arg_coding_type] = Qundecided;
9689     /* This is already set.
9690        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
9691     plist[8] = intern (":charset-list");
9692     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
9693     plist[11] = args[coding_arg_for_unibyte] = Qnil;
9694     plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
9695     plist[15] = args[coding_arg_eol_type] = Qnil;
9696     args[coding_arg_plist] = Flist (16, plist);
9697     Fdefine_coding_system_internal (coding_arg_max, args);
9698   }
9699
9700   setup_coding_system (Qno_conversion, &keyboard_coding);
9701   setup_coding_system (Qundecided, &terminal_coding);
9702   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9703
9704   {
9705     int i;
9706
9707     for (i = 0; i < coding_category_max; i++)
9708       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9709   }
9710 }
9711
9712 char *
9713 emacs_strerror (error_number)
9714      int error_number;
9715 {
9716   char *str;
9717
9718   synchronize_system_messages_locale ();
9719   str = strerror (error_number);
9720
9721   if (! NILP (Vlocale_coding_system))
9722     {
9723       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9724                                                       Vlocale_coding_system,
9725                                                       0);
9726       str = (char *) SDATA (dec);
9727     }
9728
9729   return str;
9730 }
9731
9732 #endif /* emacs */
9733
9734 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9735    (do not change this comment) */