src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 2001, 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
   3    Copyright (C) 1995, 1997, 1998, 2002, 2003, 2004, 2005
   4      National Institute of Advanced Industrial Science and Technology (AIST)
   5      Registration Number H14PRO021
   6    Copyright (C) 2003
   7      National Institute of Advanced Industrial Science and Technology (AIST)
   8      Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software; you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation; either version 2, or (at your option)
  15 any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs; see the file COPYING.  If not, write to
  24 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  25 Boston, MA 02110-1301, USA.  */
  26
  27 /*** TABLE OF CONTENTS ***
  28
  29   0. General comments
  30   1. Preamble
  31   2. Emacs' internal format (emacs-utf-8) handlers
  32   3. UTF-8 handlers
  33   4. UTF-16 handlers
  34   5. Charset-base coding systems handlers
  35   6. emacs-mule (old Emacs' internal format) handlers
  36   7. ISO2022 handlers
  37   8. Shift-JIS and BIG5 handlers
  38   9. CCL handlers
  39   10. C library functions
  40   11. Emacs Lisp library functions
  41   12. Postamble
  42
  43 */
  44
  45 /*** 0. General comments ***
  46
  47
  48 CODING SYSTEM
  49
  50   A coding system is an object for an encoding mechanism that contains
  51   information about how to convert byte sequences to character
  52   sequences and vice versa.  When we say "decode", it means converting
  53   a byte sequence of a specific coding system into a character
  54   sequence that is represented by Emacs' internal coding system
  55   `emacs-utf-8', and when we say "encode", it means converting a
  56   character sequence of emacs-utf-8 to a byte sequence of a specific
  57   coding system.
  58
  59   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  60   C level, a coding system is represented by a vector of attributes
  61   stored in the hash table Vcoding_system_hash_table.  The conversion
  62   from coding system symbol to attributes vector is done by looking up
  63   Vcoding_system_hash_table by the symbol.
  64
  65   Coding systems are classified into the following types depending on
  66   the encoding mechanism.  Here's a brief description of the types.
  67
  68   o UTF-8
  69
  70   o UTF-16
  71
  72   o Charset-base coding system
  73
  74   A coding system defined by one or more (coded) character sets.
  75   Decoding and encoding are done by a code converter defined for each
  76   character set.
  77
  78   o Old Emacs internal format (emacs-mule)
  79
  80   The coding system adopted by old versions of Emacs (20 and 21).
  81
  82   o ISO2022-base coding system
  83
  84   The most famous coding system for multiple character sets.  X's
  85   Compound Text, various EUCs (Extended Unix Code), and coding systems
  86   used in the Internet communication such as ISO-2022-JP are all
  87   variants of ISO2022.
  88
  89   o SJIS (or Shift-JIS or MS-Kanji-Code)
  90
  91   A coding system to encode character sets: ASCII, JISX0201, and
  92   JISX0208.  Widely used for PC's in Japan.  Details are described in
  93   section 8.
  94
  95   o BIG5
  96
  97   A coding system to encode character sets: ASCII and Big5.  Widely
  98   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  99   described in section 8.  In this file, when we write "big5" (all
 100   lowercase), we mean the coding system, and when we write "Big5"
 101   (capitalized), we mean the character set.
 102
 103   o CCL
 104
 105   If a user wants to decode/encode text encoded in a coding system
 106   not listed above, he can supply a decoder and an encoder for it in
 107   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 108   program while decoding/encoding.
 109
 110   o Raw-text
 111
 112   A coding system for text containing raw eight-bit data.  Emacs
 113   treats each byte of source text as a character (except for
 114   end-of-line conversion).
 115
 116   o No-conversion
 117
 118   Like raw text, but don't do end-of-line conversion.
 119
 120
 121 END-OF-LINE FORMAT
 122
 123   How text end-of-line is encoded depends on operating system.  For
 124   instance, Unix's format is just one byte of LF (line-feed) code,
 125   whereas DOS's format is two-byte sequence of `carriage-return' and
 126   `line-feed' codes.  MacOS's format is usually one byte of
 127   `carriage-return'.
 128
 129   Since text character encoding and end-of-line encoding are
 130   independent, any coding system described above can take any format
 131   of end-of-line (except for no-conversion).
 132
 133 STRUCT CODING_SYSTEM
 134
 135   Before using a coding system for code conversion (i.e. decoding and
 136   encoding), we setup a structure of type `struct coding_system'.
 137   This structure keeps various information about a specific code
 138   conversion (e.g. the location of source and destination data).
 139
 140 */
 141
 142 /* COMMON MACROS */
 143
 144
 145 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 146
 147   These functions check if a byte sequence specified as a source in
 148   CODING conforms to the format of XXX, and update the members of
 149   DETECT_INFO.
 150
 151   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 152
 153   Below is the template of these functions.  */
 154
 155 #if 0
 156 static int
 157 detect_coding_XXX (coding, detect_info)
 158      struct coding_system *coding;
 159      struct coding_detection_info *detect_info;
 160 {
     /* Template only.  Scan the source bytes of CODING.  On the first
        byte that does not conform to XXX, mark the category as rejected
        in DETECT_INFO and return 0.  When the source runs out first,
        merge FOUND into DETECT_INFO->found and return 1.  */
 161   const unsigned char *src = coding->source;
 162   const unsigned char *src_end = coding->source + coding->src_bytes;
 163   int multibytep = coding->src_multibyte;
 164   int consumed_chars = 0;
 165   int found = 0;
 166   ...;  /* Other locals, e.g. `c' and the `src_base' read by ONE_MORE_BYTE.  */
 167
 168   while (1)
 169     {
 170       /* Get one byte from the source.  If the source is exhausted, jump
 171          to no_more_source:.  */
 172       ONE_MORE_BYTE (c);
 173
 174       if (! __C_conforms_to_XXX___ (c))
 175         break;
 176       if (__C_strongly_suggests_XXX__ (c))
 177         found = CATEGORY_MASK_XXX;
 178     }
 179   /* The byte sequence is invalid for XXX.  */
 180   detect_info->rejected |= CATEGORY_MASK_XXX;
 181   return 0;
 182
 183  no_more_source:
 184   /* The source was exhausted successfully.  */
 185   detect_info->found |= found;
 186   return 1;
 187 }
 188 #endif
 189
 190 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 191
 192   These functions decode a byte sequence specified as a source by
 193   CODING.  The resulting multibyte text goes to a place pointed to by
 194   CODING->charbuf, the length of which should not exceed
 195   CODING->charbuf_size.
 196
 197   These functions set the information of original and decoded texts in
 198   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 199   They also set CODING->result to one of CODING_RESULT_XXX indicating
 200   how the decoding is finished.
 201
 202   Below is the template of these functions.  */
 203
 204 #if 0
 205 static void
 206 decode_coding_XXXX (coding)
 207      struct coding_system *coding;
 208 {
     /* Template only.  Decode source bytes of CODING, starting after the
        bytes already consumed, into CODING->charbuf, starting after the
        elements already used.  Stop when the source or the room in
        CHARBUF runs out, then update CODING->consumed,
        CODING->consumed_char and CODING->charbuf_used.  */
 209   const unsigned char *src = coding->source + coding->consumed;
 210   const unsigned char *src_end = coding->source + coding->src_bytes;
 211   /* SRC_BASE remembers the start position in source in each loop.
 212      The loop will be exited when there's not enough source bytes, or
 213      when there's no room in CHARBUF for a decoded character.  */
 214   const unsigned char *src_base;
 215   /* A buffer to produce decoded characters.  */
 216   int *charbuf = coding->charbuf + coding->charbuf_used;
 217   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 218   int multibytep = coding->src_multibyte;
 219
 220   while (1)
 221     {
 222       src_base = src;
 223       if (charbuf >= charbuf_end)
 224         /* No more room to produce a decoded character.  */
 225         break;
 226       ONE_MORE_BYTE (c);
 227       /* Decode it. */
 228     }
 229
 230  no_more_source:
 231   if (src_base < src_end
 232       && coding->mode & CODING_MODE_LAST_BLOCK)
 233     /* If the source ends by partial bytes to construct a character,
 234        treat them as eight-bit raw data.  */
 235     while (src_base < src_end && charbuf < charbuf_end)
 236       *charbuf++ = *src_base++;
 237   /* Remember how many bytes and characters we consumed.  If the
 238      source is multibyte, the bytes and chars are not identical.  */
 239   coding->consumed = coding->consumed_char = src_base - coding->source;
 240   /* Remember how many characters we produced.  */
 241   coding->charbuf_used = charbuf - coding->charbuf;
 242 }
 243 #endif
 244
 245 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 246
 247   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 248   internal multibyte format by CODING.  The resulting byte sequence
 249   goes to a place pointed to by DESTINATION, the length of which
 250   should not exceed DST_BYTES.
 251
 252   These functions set the information of original and encoded texts in
 253   the members produced, produced_char, consumed, and consumed_char of
 254   the structure *CODING.  They also set the member result to one of
 255   CODING_RESULT_XXX indicating how the encoding finished.
 256
 257   DST_BYTES zero means that source area and destination area are
 258   overlapped, which means that we can produce an encoded text until it
 259   reaches the head of not-yet-encoded source text.
 260
 261   Below is a template of these functions.  */
 262 #if 0
 263 static void
 264 encode_coding_XXX (coding)
 265      struct coding_system *coding;
 266 {
     /* Template only.  Encode the CODING->charbuf_used characters held in
        CODING->charbuf into CODING->destination, starting after the bytes
        already produced.  Then update CODING->produced_char and
        CODING->produced.  */
 267   int multibytep = coding->dst_multibyte;
 268   int *charbuf = coding->charbuf;
 269   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 270   unsigned char *dst = coding->destination + coding->produced;
 271   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 272   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 273   int produced_chars = 0;
 274
 275   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 276     {
 277       int c = *charbuf;
 278       /* Encode C into DST, and increment DST.  */
 279     }
 280  label_no_more_destination:
 281   /* How many chars and bytes we produced.  */
 282   coding->produced_char += produced_chars;
 283   coding->produced = dst - coding->destination;
 284 }
 285 #endif
 286
 287 \f
 288 /*** 1. Preamble ***/
 289
 290 #include <config.h>
 291 #include <stdio.h>
 292
 293 #include "lisp.h"
 294 #include "buffer.h"
 295 #include "character.h"
 296 #include "charset.h"
 297 #include "ccl.h"
 298 #include "composite.h"
 299 #include "coding.h"
 300 #include "window.h"
 301
     /* NOTE(review): presumably the hash table that maps a coding system
        symbol to its attribute vector, as described in the general
        comments of section 0 -- confirm against the code that fills it.  */
 302 Lisp_Object Vcoding_system_hash_table;
 303
 304 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 305 Lisp_Object Qunix, Qdos;
 306 extern Lisp_Object Qmac;        /* frame.c */
 307 Lisp_Object Qbuffer_file_coding_system;
 308 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 309 Lisp_Object Qdefault_char;
 310 Lisp_Object Qno_conversion, Qundecided;
 311 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 312 Lisp_Object Qbig, Qlittle;
 313 Lisp_Object Qcoding_system_history;
 314 Lisp_Object Qvalid_codes;
     /* NOTE(review): `QCdefalut_char' looks like a misspelling of
        `QCdefault_char'; a rename has to cover every user of the name.  */
 315 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
 316 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 317 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 318 Lisp_Object QCascii_compatible_p;
 319
 320 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 321 Lisp_Object Qcall_process, Qcall_process_region;
 322 Lisp_Object Qstart_process, Qopen_network_stream;
 323 Lisp_Object Qtarget_idx;
 324
 325 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 326 Lisp_Object Qinterrupted, Qinsufficient_memory;
 327
 328 /* If a symbol has this property, evaluate the value to define the
 329    symbol as a coding system.  */
 330 static Lisp_Object Qcoding_system_define_form;
 331
 332 int coding_system_require_warning;
 333
 334 Lisp_Object Vselect_safe_coding_system_function;
 335
 336 /* Mnemonic string for each format of end-of-line.  */
 337 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 338 /* Mnemonic string to indicate that the format of end-of-line is not
 339    yet decided.  */
 340 Lisp_Object eol_mnemonic_undecided;
 341
 342 #ifdef emacs
 343
 344 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 345
 346 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 347
 348 /* Coding systems emacs-mule and raw-text are for converting only
 349    end-of-line format.  */
 350 Lisp_Object Qemacs_mule, Qraw_text;
 351 Lisp_Object Qutf_8_emacs;
 352
 353 /* Coding-systems are passed between Emacs Lisp programs and C internal
 354    routines by the following three variables.  */
 355 /* Coding-system for reading files and receiving data from process.  */
 356 Lisp_Object Vcoding_system_for_read;
 357 /* Coding-system for writing files and sending data to process.  */
 358 Lisp_Object Vcoding_system_for_write;
 359 /* Coding-system actually used in the latest I/O.  */
 360 Lisp_Object Vlast_coding_system_used;
 361 /* Set to non-nil when an error is detected during code conversion.  */
 362 Lisp_Object Vlast_code_conversion_error;
 363 /* A vector of length 256 which contains information about special
 364    Latin codes (especially for dealing with Microsoft codes).  */
 365 Lisp_Object Vlatin_extra_code_table;
 366
 367 /* Flag to inhibit code conversion of end-of-line format.  */
 368 int inhibit_eol_conversion;
 369
 370 /* Flag to inhibit ISO2022 escape sequence detection.  */
 371 int inhibit_iso_escape_detection;
 372
 373 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 374 int inherit_process_coding_system;
 375
 376 /* Coding system to be used to encode text for terminal display.  */
 377 struct coding_system terminal_coding;
 378
 379 /* Coding system to be used to encode text for terminal display when
 380    terminal coding system is nil.  */
 381 struct coding_system safe_terminal_coding;
 382
 383 /* Coding system of the input sent from the terminal keyboard.  */
 384 struct coding_system keyboard_coding;
 385
 386 Lisp_Object Vfile_coding_system_alist;
 387 Lisp_Object Vprocess_coding_system_alist;
 388 Lisp_Object Vnetwork_coding_system_alist;
 389
 390 Lisp_Object Vlocale_coding_system;
 391
 392 #endif /* emacs */
 393
 394 /* Flag to tell whether a translation table is looked up on character
 395    code conversion.  */
 396 Lisp_Object Venable_character_translation;
 397 /* Standard translation table to look up on decoding (reading).  */
 398 Lisp_Object Vstandard_translation_table_for_decode;
 399 /* Standard translation table to look up on encoding (writing).  */
 400 Lisp_Object Vstandard_translation_table_for_encode;
 401
 402 Lisp_Object Qtranslation_table;
 403 Lisp_Object Qtranslation_table_id;
 404 Lisp_Object Qtranslation_table_for_decode;
 405 Lisp_Object Qtranslation_table_for_encode;
 406
 407 /* Alist of charsets vs revision numbers.  */
 408 static Lisp_Object Vcharset_revision_table;
 409
 410 /* Default coding systems used for process I/O.  */
 411 Lisp_Object Vdefault_process_coding_system;
 412
 413 /* Char table for translating Quail and self-inserting input.  */
 414 Lisp_Object Vtranslation_table_for_input;
 415
 416 /* Two special coding systems.  */
 417 Lisp_Object Vsjis_coding_system;
 418 Lisp_Object Vbig5_coding_system;
 419
 420 /* ISO2022 section */
 421
     /* Element REG of the `coding_attr_iso_initial' attribute of CODING,
        passed through XINT.  The attributes are found by CODING_ID_ATTRS
        on CODING->id.  */
 422 #define CODING_ISO_INITIAL(coding, reg)                 \
 423   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 424                      coding_attr_iso_initial),          \
 425                reg)))
 426
 427
     /* Return the entry of CODING's `safe_charsets' table for CHARSET_ID,
        or -1 when CHARSET_ID is greater than CODING's `max_charset_id'.
        CHARSET_ID is parenthesized so that an expression argument cannot
        regroup with the comparison.  It is evaluated twice when it is in
        range, so it must not have side effects.  No lower bound is
        checked.  */
 428 #define CODING_ISO_REQUEST(coding, charset_id)  \
 429   (((charset_id) <= (coding)->max_charset_id    \
 430     ? (coding)->safe_charsets[(charset_id)]     \
 431     : -1))
 432
 433
     /* Accessors for the `spec.iso_2022' members of CODING.  REG and PLANE
        index the `current_designation' and `current_invocation' arrays
        respectively.  */
 434 #define CODING_ISO_FLAGS(coding)        \
 435   ((coding)->spec.iso_2022.flags)
 436 #define CODING_ISO_DESIGNATION(coding, reg)     \
 437   ((coding)->spec.iso_2022.current_designation[reg])
 438 #define CODING_ISO_INVOCATION(coding, plane)    \
 439   ((coding)->spec.iso_2022.current_invocation[plane])
 440 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 441   ((coding)->spec.iso_2022.single_shifting)
 442 #define CODING_ISO_BOL(coding)  \
 443   ((coding)->spec.iso_2022.bol)
     /* The designation entry indexed by the invocation recorded for
        PLANE, i.e. current_designation[current_invocation[PLANE]].  */
 444 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 445   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 446
 447 /* Control characters of ISO2022.  */
 448                         /* code */      /* function */
 449 #define ISO_CODE_LF     0x0A            /* line-feed */
 450 #define ISO_CODE_CR     0x0D            /* carriage-return */
 451 #define ISO_CODE_SO     0x0E            /* shift-out */
 452 #define ISO_CODE_SI     0x0F            /* shift-in */
 453 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 454 #define ISO_CODE_ESC    0x1B            /* escape */
 455 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 456 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 457 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 458
 459 /* Every 1-byte code of ISO2022 is classified into one of the
 460    following classes.  */
 461 enum iso_code_class_type
 462   {
 463     ISO_control_0,              /* Control codes in the range
 464                                    0x00..0x1F and 0x7F, except for the
 465                                    following 4 codes.  */
 466     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 467     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 468     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 469     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 470     ISO_control_1,              /* Control codes in the range
 471                                    0x80..0x9F, except for the
 472                                    following 3 codes.  */
 473     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 474     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 475     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 476     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 477     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 478     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 479     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 480   };
 481
 482 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
 483     `iso-flags' attribute of an iso2022 coding system.  */
 484
 485 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 486    instead of the correct short-form sequence (e.g. ESC $ A).  */
 487 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 488
 489 /* If set, reset graphic planes and registers at end-of-line to the
 490    initial state.  */
 491 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 492
 493 /* If set, reset graphic planes and registers before any control
 494    characters to the initial state.  */
 495 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 496
 497 /* If set, encode by 7-bit environment.  */
 498 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 499
 500 /* If set, use locking-shift function.  */
 501 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 502
 503 /* If set, use single-shift function.  Overrides
 504    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 505 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 506
 507 /* If set, use designation escape sequence.  */
 508 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 509
 510 /* If set, produce revision number sequence.  */
 511 #define CODING_ISO_FLAG_REVISION        0x0080
 512
 513 /* If set, produce ISO6429's direction specifying sequence.  */
 514 #define CODING_ISO_FLAG_DIRECTION       0x0100
 515
 516 /* If set, assume designation states are reset at beginning of line on
 517    output.  */
 518 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 519
 520 /* If set, designation sequence should be placed at beginning of line
 521    on output.  */
 522 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 523
 524 /* If set, do not encode unsafe characters on output.  */
 525 #define CODING_ISO_FLAG_SAFE            0x0800
 526
 527 /* If set, extra latin codes (128..159) are accepted as a valid code
 528    on input.  */
 529 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 530
     /* NOTE(review): the five flags below carry no description here;
        their meaning has to be taken from the code that tests them.  */
 531 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 532
 533 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 534
 535 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 536
 537 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 538
     /* NOTE(review): this value skips 0x20000..0x80000 -- confirm that
        the gap is intended.  */
 539 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 540
 541 /* A character to be produced on output if encoding of the original
 542    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 543 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 544
 545
 546 /* UTF-16 section: accessors for the `spec.utf_16' members of CODING.  */
 547 #define CODING_UTF_16_BOM(coding)       \
 548   ((coding)->spec.utf_16.bom)
 549
 550 #define CODING_UTF_16_ENDIAN(coding)    \
 551   ((coding)->spec.utf_16.endian)
 552
 553 #define CODING_UTF_16_SURROGATE(coding) \
 554   ((coding)->spec.utf_16.surrogate)
 555
 556
 557 /* CCL section: CCL-related attributes of CODING, found by CODING_ID_ATTRS on CODING->id.  */
 558 #define CODING_CCL_DECODER(coding)      \
 559   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 560 #define CODING_CCL_ENCODER(coding)      \
 561   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
     /* The `coding_attr_ccl_valids' attribute passed through SDATA.  */
 562 #define CODING_CCL_VALIDS(coding)                                          \
 563   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 564
 565 /* Index for each coding category in `coding_categories'.  The same
        enumerators are the bit positions used by the CATEGORY_MASK_XXX
        macros and the element type of `coding_priorities'.  */
 566
 567 enum coding_category
 568   {
 569     coding_category_iso_7,
 570     coding_category_iso_7_tight,
 571     coding_category_iso_8_1,
 572     coding_category_iso_8_2,
 573     coding_category_iso_7_else,
 574     coding_category_iso_8_else,
 575     coding_category_utf_8,
 576     coding_category_utf_16_auto,
 577     coding_category_utf_16_be,
 578     coding_category_utf_16_le,
 579     coding_category_utf_16_be_nosig,
 580     coding_category_utf_16_le_nosig,
 581     coding_category_charset,
 582     coding_category_sjis,
 583     coding_category_big5,
 584     coding_category_ccl,
 585     coding_category_emacs_mule,
 586     /* All above are targets of code detection.  */
 587     coding_category_raw_text,
 588     coding_category_undecided,
 589     coding_category_max         /* Number of categories; sizes the tables.  */
 590   };
 591
 592 /* Definitions of flag bits used in detect_coding_XXXX.  */
 593 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 594 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 595 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 596 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 597 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 598 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 599 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 600 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 601 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 602 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 603 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 604 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 605 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 606 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 607 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 608 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 609 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 610 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 611
 612 /* This value is returned if detect_coding_mask () finds nothing other
 613    than ASCII characters.
        NOTE(review): CATEGORY_MASK_UTF_16_AUTO is a detection target in
        `enum coding_category' but is not part of this mask -- confirm
        that the omission is intended.  */
 614 #define CATEGORY_MASK_ANY               \
 615   (CATEGORY_MASK_ISO_7                  \
 616    | CATEGORY_MASK_ISO_7_TIGHT          \
 617    | CATEGORY_MASK_ISO_8_1              \
 618    | CATEGORY_MASK_ISO_8_2              \
 619    | CATEGORY_MASK_ISO_7_ELSE           \
 620    | CATEGORY_MASK_ISO_8_ELSE           \
 621    | CATEGORY_MASK_UTF_8                \
 622    | CATEGORY_MASK_UTF_16_BE            \
 623    | CATEGORY_MASK_UTF_16_LE            \
 624    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 625    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 626    | CATEGORY_MASK_CHARSET              \
 627    | CATEGORY_MASK_SJIS                 \
 628    | CATEGORY_MASK_BIG5                 \
 629    | CATEGORY_MASK_CCL                  \
 630    | CATEGORY_MASK_EMACS_MULE)
 631
 632
 633 #define CATEGORY_MASK_ISO_7BIT \
 634   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 635
 636 #define CATEGORY_MASK_ISO_8BIT \
 637   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 638
 639 #define CATEGORY_MASK_ISO_ELSE \
 640   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 641
 642 #define CATEGORY_MASK_ISO_ESCAPE        \
 643   (CATEGORY_MASK_ISO_7                  \
 644    | CATEGORY_MASK_ISO_7_TIGHT          \
 645    | CATEGORY_MASK_ISO_7_ELSE           \
 646    | CATEGORY_MASK_ISO_8_ELSE)
 647
     /* Union of the 7-bit, 8-bit and "else" ISO masks defined above.  */
 648 #define CATEGORY_MASK_ISO       \
 649   (  CATEGORY_MASK_ISO_7BIT     \
 650      | CATEGORY_MASK_ISO_8BIT   \
 651      | CATEGORY_MASK_ISO_ELSE)
 652
     /* NOTE(review): CATEGORY_MASK_UTF_16_AUTO is not included here
        either -- confirm.  */
 653 #define CATEGORY_MASK_UTF_16            \
 654   (CATEGORY_MASK_UTF_16_BE              \
 655    | CATEGORY_MASK_UTF_16_LE            \
 656    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 657    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 658
 659
 660 /* List of symbols `coding-category-xxx' ordered by priority.  This
 661    variable is exposed to Emacs Lisp.  */
 662 static Lisp_Object Vcoding_category_list;
 663
 664 /* Table of coding categories (Lisp symbols).  This variable is for
 665    internal use only.  */
 666 static Lisp_Object Vcoding_category_table;
 667
 668 /* Table of coding-categories ordered by priority.  */
 669 static enum coding_category coding_priorities[coding_category_max];
 670
 671 /* Nth element is a coding context for the coding system bound to the
 672    Nth coding category.  */
 673 static struct coding_system coding_categories[coding_category_max];
 674
 675 /*** Commonly used macros and functions ***/
 676
 677 #ifndef min
 678 #define min(a, b) ((a) < (b) ? (a) : (b))  /* One argument is evaluated twice.  */
 679 #endif
 680 #ifndef max
 681 #define max(a, b) ((a) > (b) ? (a) : (b))  /* One argument is evaluated twice.  */
 682 #endif
 683
     /* Set ATTRS to the attributes of CODING, found by CODING_ID_ATTRS on
        CODING->id, and CHARSET_LIST to what CODING_ATTR_CHARSET_LIST
        yields for those attributes.  ATTRS and CHARSET_LIST must be
        assignable.  */
 684 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 685   do {                                                  \
 686     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 687     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 688   } while (0)
 689
 690
 691 /* Safely get one byte from the source text pointed by SRC which ends
 692    at SRC_END, and set C to that byte.  If there are not enough bytes
 693    in the source, it jumps to `no_more_source', first recording
 694    CODING_RESULT_INSUFFICIENT_SRC when SRC_BASE < SRC.  If multibytep
 695    is nonzero and the byte has its high bit set, then for a lead byte
 696    0xC0 or 0xC1 C becomes ((C & 1) << 6) | the next byte; otherwise C
 697    is set to the negative of the character code that string_char
        returns for the sequence at SRC, SRC is advanced past it, and
        CODING_RESULT_INVALID_SRC is recorded.  CONSUMED_CHARS is
        incremented once per successful call.  The caller should declare
        and set these variables appropriately in advance:
             src, src_end, src_base, multibytep, consumed_chars, coding
        and provide the label `no_more_source'.  */
 698
 699 #define ONE_MORE_BYTE(c)                                \
 700   do {                                                  \
 701     if (src == src_end)                                 \
 702       {                                                 \
 703         if (src_base < src)                             \
 704           record_conversion_result                      \
 705             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 706         goto no_more_source;                            \
 707       }                                                 \
 708     c = *src++;                                         \
 709     if (multibytep && (c & 0x80))                       \
 710       {                                                 \
 711         if ((c & 0xFE) == 0xC0)                         \
 712           c = ((c & 1) << 6) | *src++;                  \
 713         else                                            \
 714           {                                             \
 715             src--;                                      \
 716             c = - string_char (src, &src, NULL);        \
 717             record_conversion_result                    \
 718               (coding, CODING_RESULT_INVALID_SRC);      \
 719           }                                             \
 720       }                                                 \
 721     consumed_chars++;                                   \
 722   } while (0)
 723
 724
     /* Like ONE_MORE_BYTE, but SRC is not compared with SRC_END and there
        is no jump to `no_more_source'; checking that the source has
        enough bytes is left to the caller.  Needs src, multibytep,
        consumed_chars and coding.  */
 725 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 726   do {                                                  \
 727     c = *src++;                                         \
 728     if (multibytep && (c & 0x80))                       \
 729       {                                                 \
 730         if ((c & 0xFE) == 0xC0)                         \
 731           c = ((c & 1) << 6) | *src++;                  \
 732         else                                            \
 733           {                                             \
 734             src--;                                      \
 735             c = - string_char (src, &src, NULL);        \
 736             record_conversion_result                    \
 737               (coding, CODING_RESULT_INVALID_SRC);      \
 738           }                                             \
 739       }                                                 \
 740     consumed_chars++;                                   \
 741   } while (0)
 742
 743
 744 /* Store a byte C in the place pointed by DST and increment DST to the
 745    next free point, and increment PRODUCED_CHARS.  The caller should
 746    ensure that C is 0..127, and declare and set the variables `dst' and
 747    `produced_chars' appropriately in advance.
 748 */
 749
 750
 751 #define EMIT_ONE_ASCII_BYTE(c)  \
 752   do {                          \
 753     produced_chars++;           \
 754     *dst++ = (c);               \
 755   } while (0)
 756
 757
 758 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
 759
 760 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 761   do {                                  \
 762     produced_chars += 2;                \
 763     *dst++ = (c1), *dst++ = (c2);       \
 764   } while (0)
 765
 766
 767 /* Store a byte C in the place pointed by DST and increment DST to the
 768    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 769    nonzero, store in an appropriate multibyte form (a value >= 0x80
 770    goes through BYTE8_TO_CHAR before CHAR_STRING_ADVANCE writes it).
 771    The caller should declare and set the variables `dst', `multibytep'
        and `produced_chars' appropriately in advance.  */
 772
 773 #define EMIT_ONE_BYTE(c)                \
 774   do {                                  \
 775     produced_chars++;                   \
 776     if (multibytep)                     \
 777       {                                 \
 778         int ch = (c);                   \
 779         if (ch >= 0x80)                 \
 780           ch = BYTE8_TO_CHAR (ch);      \
 781         CHAR_STRING_ADVANCE (ch, dst);  \
 782       }                                 \
 783     else                                \
 784       *dst++ = (c);                     \
 785   } while (0)
 786
 787
 788 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 789
 790 #define EMIT_TWO_BYTES(c1, c2)          \
 791   do {                                  \
 792     produced_chars += 2;                \
 793     if (multibytep)                     \
 794       {                                 \
 795         int ch;                         \
 796                                         \
 797         ch = (c1);                      \
 798         if (ch >= 0x80)                 \
 799           ch = BYTE8_TO_CHAR (ch);      \
 800         CHAR_STRING_ADVANCE (ch, dst);  \
 801         ch = (c2);                      \
 802         if (ch >= 0x80)                 \
 803           ch = BYTE8_TO_CHAR (ch);      \
 804         CHAR_STRING_ADVANCE (ch, dst);  \
 805       }                                 \
 806     else                                \
 807       {                                 \
 808         *dst++ = (c1);                  \
 809         *dst++ = (c2);                  \
 810       }                                 \
 811   } while (0)
 812
 813
/* Emit the three bytes C1, C2 and C3 in that order, as one
   EMIT_ONE_BYTE followed by one EMIT_TWO_BYTES.  The requirements on
   the caller are those of the two macros it expands to.  */
#define EMIT_THREE_BYTES(c1, c2, c3)    \
  do {                                  \
    EMIT_ONE_BYTE (c1);                 \
    EMIT_TWO_BYTES (c2, c3);            \
  } while (0)
 819
 820
/* Emit the four bytes C1, C2, C3 and C4 in that order, as two
   consecutive EMIT_TWO_BYTES.  The requirements on the caller are
   those of EMIT_TWO_BYTES.  */
#define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
  do {                                          \
    EMIT_TWO_BYTES (c1, c2);                    \
    EMIT_TWO_BYTES (c3, c4);                    \
  } while (0)
 826
 827
/* Prototypes for static functions.  They are grouped below by the
   section of this file the functions belong to.  */

/* Conversion result bookkeeping.  */
static void record_conversion_result P_ ((struct coding_system *coding,
                                          enum coding_result_code result));

/* UTF-8.  */
static int detect_coding_utf_8 P_ ((struct coding_system *,
                                    struct coding_detection_info *info));
static void decode_coding_utf_8 P_ ((struct coding_system *));
static int encode_coding_utf_8 P_ ((struct coding_system *));

/* UTF-16.  */
static int detect_coding_utf_16 P_ ((struct coding_system *,
                                     struct coding_detection_info *info));
static void decode_coding_utf_16 P_ ((struct coding_system *));
static int encode_coding_utf_16 P_ ((struct coding_system *));

/* ISO 2022.  */
static int detect_coding_iso_2022 P_ ((struct coding_system *,
                                       struct coding_detection_info *info));
static void decode_coding_iso_2022 P_ ((struct coding_system *));
static int encode_coding_iso_2022 P_ ((struct coding_system *));

/* emacs-mule.  */
static int detect_coding_emacs_mule P_ ((struct coding_system *,
                                         struct coding_detection_info *info));
static void decode_coding_emacs_mule P_ ((struct coding_system *));
static int encode_coding_emacs_mule P_ ((struct coding_system *));

/* Shift-JIS.  */
static int detect_coding_sjis P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_sjis P_ ((struct coding_system *));
static int encode_coding_sjis P_ ((struct coding_system *));

/* BIG5.  */
static int detect_coding_big5 P_ ((struct coding_system *,
                                   struct coding_detection_info *info));
static void decode_coding_big5 P_ ((struct coding_system *));
static int encode_coding_big5 P_ ((struct coding_system *));

/* CCL.  */
static int detect_coding_ccl P_ ((struct coding_system *,
                                  struct coding_detection_info *info));
static void decode_coding_ccl P_ ((struct coding_system *));
static int encode_coding_ccl P_ ((struct coding_system *));

/* Raw text.  */
static void decode_coding_raw_text P_ ((struct coding_system *));
static int encode_coding_raw_text P_ ((struct coding_system *));

/* Source and destination address bookkeeping and allocation.  */
static void coding_set_source P_ ((struct coding_system *));
static void coding_set_destination P_ ((struct coding_system *));
static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
static void coding_alloc_by_making_gap P_ ((struct coding_system *,
                                            EMACS_INT));
static unsigned char *alloc_destination P_ ((struct coding_system *,
                                             EMACS_INT, unsigned char *));

/* Remaining helpers of the C and Lisp library sections.  */
static void setup_iso_safe_charsets P_ ((Lisp_Object));
static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
                                                     int *, int *,
                                                     unsigned char *));
static int detect_eol P_ ((const unsigned char *,
                           EMACS_INT, enum coding_category));
static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
static void decode_eol P_ ((struct coding_system *));
static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
                                        int, int *, int *));
static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
static INLINE void produce_composition P_ ((struct coding_system *, int *,
                                            EMACS_INT));
static INLINE void produce_charset P_ ((struct coding_system *, int *,
                                        EMACS_INT));
static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
static int decode_coding P_ ((struct coding_system *));
static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
                                                      struct coding_system *,
                                                      int *, EMACS_INT *));
static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
                                                  struct coding_system *,
                                                  int *, EMACS_INT *));
static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
static int encode_coding P_ ((struct coding_system *));
static Lisp_Object make_conversion_work_buffer P_ ((int));
static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
static INLINE int char_encodable_p P_ ((int, Lisp_Object));
static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 906
 907 static void
 908 record_conversion_result (struct coding_system *coding,
 909                           enum coding_result_code result)
 910 {
 911   coding->result = result;
 912   switch (result)
 913     {
 914     case CODING_RESULT_INSUFFICIENT_SRC:
 915       Vlast_code_conversion_error = Qinsufficient_source;
 916       break;
 917     case CODING_RESULT_INCONSISTENT_EOL:
 918       Vlast_code_conversion_error = Qinconsistent_eol;
 919       break;
 920     case CODING_RESULT_INVALID_SRC:
 921       Vlast_code_conversion_error = Qinvalid_source;
 922       break;
 923     case CODING_RESULT_INTERRUPT:
 924       Vlast_code_conversion_error = Qinterrupted;
 925       break;
 926     case CODING_RESULT_INSUFFICIENT_MEM:
 927       Vlast_code_conversion_error = Qinsufficient_memory;
 928       break;
 929     default:
 930       Vlast_code_conversion_error = intern ("Unknown error");
 931     }
 932 }
 933
 934 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 935   do {                                                                       \
 936     charset_map_loaded = 0;                                                  \
 937     c = DECODE_CHAR (charset, code);                                         \
 938     if (charset_map_loaded)                                                  \
 939       {                                                                      \
 940         const unsigned char *orig = coding->source;                          \
 941         EMACS_INT offset;                                                    \
 942                                                                              \
 943         coding_set_source (coding);                                          \
 944         offset = coding->source - orig;                                      \
 945         src += offset;                                                       \
 946         src_base += offset;                                                  \
 947         src_end += offset;                                                   \
 948       }                                                                      \
 949   } while (0)
 950
 951
 952 #define ASSURE_DESTINATION(bytes)                               \
 953   do {                                                          \
 954     if (dst + (bytes) >= dst_end)                               \
 955       {                                                         \
 956         int more_bytes = charbuf_end - charbuf + (bytes);       \
 957                                                                 \
 958         dst = alloc_destination (coding, more_bytes, dst);      \
 959         dst_end = coding->destination + coding->dst_bytes;      \
 960       }                                                         \
 961   } while (0)
 962
 963
 964
 965 static void
 966 coding_set_source (coding)
 967      struct coding_system *coding;
 968 {
 969   if (BUFFERP (coding->src_object))
 970     {
 971       struct buffer *buf = XBUFFER (coding->src_object);
 972
 973       if (coding->src_pos < 0)
 974         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 975       else
 976         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 977     }
 978   else if (STRINGP (coding->src_object))
 979     {
 980       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 981     }
 982   else
 983     /* Otherwise, the source is C string and is never relocated
 984        automatically.  Thus we don't have to update anything.  */
 985     ;
 986 }
 987
 988 static void
 989 coding_set_destination (coding)
 990      struct coding_system *coding;
 991 {
 992   if (BUFFERP (coding->dst_object))
 993     {
 994       if (coding->src_pos < 0)
 995         {
 996           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
 997           coding->dst_bytes = (GAP_END_ADDR
 998                                - (coding->src_bytes - coding->consumed)
 999                                - coding->destination);
1000         }
1001       else
1002         {
1003           /* We are sure that coding->dst_pos_byte is before the gap
1004              of the buffer. */
1005           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
1006                                  + coding->dst_pos_byte - 1);
1007           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
1008                                - coding->destination);
1009         }
1010     }
1011   else
1012     /* Otherwise, the destination is C string and is never relocated
1013        automatically.  Thus we don't have to update anything.  */
1014     ;
1015 }
1016
1017
1018 static void
1019 coding_alloc_by_realloc (coding, bytes)
1020      struct coding_system *coding;
1021      EMACS_INT bytes;
1022 {
1023   coding->destination = (unsigned char *) xrealloc (coding->destination,
1024                                                     coding->dst_bytes + bytes);
1025   coding->dst_bytes += bytes;
1026 }
1027
1028 static void
1029 coding_alloc_by_making_gap (coding, bytes)
1030      struct coding_system *coding;
1031      EMACS_INT bytes;
1032 {
1033   if (BUFFERP (coding->dst_object)
1034       && EQ (coding->src_object, coding->dst_object))
1035     {
1036       EMACS_INT add = coding->src_bytes - coding->consumed;
1037
1038       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1039       make_gap (bytes);
1040       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1041     }
1042   else
1043     {
1044       Lisp_Object this_buffer;
1045
1046       this_buffer = Fcurrent_buffer ();
1047       set_buffer_internal (XBUFFER (coding->dst_object));
1048       make_gap (bytes);
1049       set_buffer_internal (XBUFFER (this_buffer));
1050     }
1051 }
1052
1053
1054 static unsigned char *
1055 alloc_destination (coding, nbytes, dst)
1056      struct coding_system *coding;
1057      EMACS_INT nbytes;
1058      unsigned char *dst;
1059 {
1060   EMACS_INT offset = dst - coding->destination;
1061
1062   if (BUFFERP (coding->dst_object))
1063     coding_alloc_by_making_gap (coding, nbytes);
1064   else
1065     coding_alloc_by_realloc (coding, nbytes);
1066   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1067   coding_set_destination (coding);
1068   dst = coding->destination + offset;
1069   return dst;
1070 }
1071
1072 /** Macros for annotations.  */
1073
/* Maximum length of annotation data (sum of annotations for
   composition and charset).  The first `4' covers the leading
   elements of a composition annotation (-LENGTH, ANNOTATION_MASK,
   NCHARS, METHOD) and the last `4' a whole charset annotation
   (-LENGTH, ANNOTATION_MASK, NCHARS, CHARSET-ID).  The middle term
   allows for MAX_COMPOSITION_COMPONENTS * 2 - 1 component elements
   (presumably the components interleaved with composition rules --
   TODO confirm against the composition code).  */
#define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1077
1078 /* An annotation data is stored in the array coding->charbuf in this
1079    format:
1080      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1081    LENGTH is the number of elements in the annotation.
1082    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1083    NCHARS is the number of characters in the text annotated.
1084
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... METHOD [ COMPOSITION-COMPONENTS ... ]
   METHOD is one of enum composition_method.
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.
1093
1094    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1095    follows.  */
1096
/* Append the three leading elements of an annotation to BUF and
   advance BUF past them: the negated length LEN, the annotation kind
   MASK, and NCHARS, the number of characters annotated.  Also mark
   `coding' (which must be in scope at the point of use) as annotated.

   The expansion deliberately does not end in a semicolon, so that the
   macro behaves as one statement and can be used in an unbraced
   `if ... else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1104
/* Append to BUF a composition annotation for NCHARS characters
   composed by METHOD: the common annotation elements with
   CODING_ANNOTATE_COMPOSITION_MASK, followed by METHOD.  The length
   recorded is 4, which does not include any composition components.
   BUF and METHOD are parenthesized in the expansion, as the
   parameters of ADD_ANNOTATION_DATA are.  */

#define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (method);                                                    \
  } while (0)
1110
1111
/* Append to BUF a charset annotation stating that the next NCHARS
   characters belong to the charset ID: the common annotation elements
   with CODING_ANNOTATE_CHARSET_MASK and length 4, followed by ID.
   BUF and ID are parenthesized in the expansion, as the parameters of
   ADD_ANNOTATION_DATA are.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1117
1118 \f
1119 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1120
1121
1122
1123 \f
1124 /*** 3. UTF-8 ***/
1125
1126 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1127    Check if a text is encoded in UTF-8.  If it is, return 1, else
1128    return 0.  */
1129
/* Predicates classifying one octet C by its high-order bits:
     0xxxxxxx  (or any value below 0x80)   single-octet character
     10xxxxxx                              extra (trailing) octet
     110xxxxx, 1110xxxx, 11110xxx, 111110xx
                                           leading octet of a 2-, 3-,
                                           4- or 5-octet sequence.  */
#define UTF_8_1_OCTET_P(c)         (0x80 > (c))
#define UTF_8_EXTRA_OCTET_P(c)     (0x80 == ((c) & 0xC0))
#define UTF_8_2_OCTET_LEADING_P(c) (0xC0 == ((c) & 0xE0))
#define UTF_8_3_OCTET_LEADING_P(c) (0xE0 == ((c) & 0xF0))
#define UTF_8_4_OCTET_LEADING_P(c) (0xF0 == ((c) & 0xF8))
#define UTF_8_5_OCTET_LEADING_P(c) (0xF8 == ((c) & 0xFC))
1136
1137 static int
1138 detect_coding_utf_8 (coding, detect_info)
1139      struct coding_system *coding;
1140      struct coding_detection_info *detect_info;
1141 {
1142   const unsigned char *src = coding->source, *src_base;
1143   const unsigned char *src_end = coding->source + coding->src_bytes;
1144   int multibytep = coding->src_multibyte;
1145   int consumed_chars = 0;
1146   int found = 0;
1147
1148   detect_info->checked |= CATEGORY_MASK_UTF_8;
1149   /* A coding system of this category is always ASCII compatible.  */
1150   src += coding->head_ascii;
1151
1152   while (1)
1153     {
1154       int c, c1, c2, c3, c4;
1155
1156       src_base = src;
1157       ONE_MORE_BYTE (c);
1158       if (c < 0 || UTF_8_1_OCTET_P (c))
1159         continue;
1160       ONE_MORE_BYTE (c1);
1161       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1162         break;
1163       if (UTF_8_2_OCTET_LEADING_P (c))
1164         {
1165           found = CATEGORY_MASK_UTF_8;
1166           continue;
1167         }
1168       ONE_MORE_BYTE (c2);
1169       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1170         break;
1171       if (UTF_8_3_OCTET_LEADING_P (c))
1172         {
1173           found = CATEGORY_MASK_UTF_8;
1174           continue;
1175         }
1176       ONE_MORE_BYTE (c3);
1177       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1178         break;
1179       if (UTF_8_4_OCTET_LEADING_P (c))
1180         {
1181           found = CATEGORY_MASK_UTF_8;
1182           continue;
1183         }
1184       ONE_MORE_BYTE (c4);
1185       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1186         break;
1187       if (UTF_8_5_OCTET_LEADING_P (c))
1188         {
1189           found = CATEGORY_MASK_UTF_8;
1190           continue;
1191         }
1192       break;
1193     }
1194   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1195   return 0;
1196
1197  no_more_source:
1198   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1199     {
1200       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1201       return 0;
1202     }
1203   detect_info->found |= found;
1204   return 1;
1205 }
1206
1207
1208 static void
1209 decode_coding_utf_8 (coding)
1210      struct coding_system *coding;
1211 {
1212   const unsigned char *src = coding->source + coding->consumed;
1213   const unsigned char *src_end = coding->source + coding->src_bytes;
1214   const unsigned char *src_base;
1215   int *charbuf = coding->charbuf + coding->charbuf_used;
1216   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1217   int consumed_chars = 0, consumed_chars_base;
1218   int multibytep = coding->src_multibyte;
1219   Lisp_Object attr, charset_list;
1220
1221   CODING_GET_INFO (coding, attr, charset_list);
1222
1223   while (1)
1224     {
1225       int c, c1, c2, c3, c4, c5;
1226
1227       src_base = src;
1228       consumed_chars_base = consumed_chars;
1229
1230       if (charbuf >= charbuf_end)
1231         break;
1232
1233       ONE_MORE_BYTE (c1);
1234       if (c1 < 0)
1235         {
1236           c = - c1;
1237         }
1238       else if (UTF_8_1_OCTET_P(c1))
1239         {
1240           c = c1;
1241         }
1242       else
1243         {
1244           ONE_MORE_BYTE (c2);
1245           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1246             goto invalid_code;
1247           if (UTF_8_2_OCTET_LEADING_P (c1))
1248             {
1249               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1250               /* Reject overlong sequences here and below.  Encoders
1251                  producing them are incorrect, they can be misleading,
1252                  and they mess up read/write invariance.  */
1253               if (c < 128)
1254                 goto invalid_code;
1255             }
1256           else
1257             {
1258               ONE_MORE_BYTE (c3);
1259               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1260                 goto invalid_code;
1261               if (UTF_8_3_OCTET_LEADING_P (c1))
1262                 {
1263                   c = (((c1 & 0xF) << 12)
1264                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1265                   if (c < 0x800
1266                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1267                     goto invalid_code;
1268                 }
1269               else
1270                 {
1271                   ONE_MORE_BYTE (c4);
1272                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1273                     goto invalid_code;
1274                   if (UTF_8_4_OCTET_LEADING_P (c1))
1275                     {
1276                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1277                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1278                     if (c < 0x10000)
1279                       goto invalid_code;
1280                     }
1281                   else
1282                     {
1283                       ONE_MORE_BYTE (c5);
1284                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1285                         goto invalid_code;
1286                       if (UTF_8_5_OCTET_LEADING_P (c1))
1287                         {
1288                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1289                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1290                                | (c5 & 0x3F));
1291                           if ((c > MAX_CHAR) || (c < 0x200000))
1292                             goto invalid_code;
1293                         }
1294                       else
1295                         goto invalid_code;
1296                     }
1297                 }
1298             }
1299         }
1300
1301       *charbuf++ = c;
1302       continue;
1303
1304     invalid_code:
1305       src = src_base;
1306       consumed_chars = consumed_chars_base;
1307       ONE_MORE_BYTE (c);
1308       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1309       coding->errors++;
1310     }
1311
1312  no_more_source:
1313   coding->consumed_char += consumed_chars_base;
1314   coding->consumed = src_base - coding->source;
1315   coding->charbuf_used = charbuf - coding->charbuf;
1316 }
1317
1318
1319 static int
1320 encode_coding_utf_8 (coding)
1321      struct coding_system *coding;
1322 {
1323   int multibytep = coding->dst_multibyte;
1324   int *charbuf = coding->charbuf;
1325   int *charbuf_end = charbuf + coding->charbuf_used;
1326   unsigned char *dst = coding->destination + coding->produced;
1327   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1328   int produced_chars = 0;
1329   int c;
1330
1331   if (multibytep)
1332     {
1333       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1334
1335       while (charbuf < charbuf_end)
1336         {
1337           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1338
1339           ASSURE_DESTINATION (safe_room);
1340           c = *charbuf++;
1341           if (CHAR_BYTE8_P (c))
1342             {
1343               c = CHAR_TO_BYTE8 (c);
1344               EMIT_ONE_BYTE (c);
1345             }
1346           else
1347             {
1348               CHAR_STRING_ADVANCE (c, pend);
1349               for (p = str; p < pend; p++)
1350                 EMIT_ONE_BYTE (*p);
1351             }
1352         }
1353     }
1354   else
1355     {
1356       int safe_room = MAX_MULTIBYTE_LENGTH;
1357
1358       while (charbuf < charbuf_end)
1359         {
1360           ASSURE_DESTINATION (safe_room);
1361           c = *charbuf++;
1362           if (CHAR_BYTE8_P (c))
1363             *dst++ = CHAR_TO_BYTE8 (c);
1364           else
1365             dst += CHAR_STRING (c, dst);
1366           produced_chars++;
1367         }
1368     }
1369   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1370   coding->produced_char += produced_chars;
1371   coding->produced = dst - coding->destination;
1372   return 0;
1373 }
1374
1375
1376 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1377    Check if a text is encoded in one of UTF-16 based coding systems.
1378    If it is, return 1, else return 0.  */
1379
/* Nonzero if the 16-bit unit VAL is a high surrogate, i.e. its top
   six bits are those of 0xD800.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (0xD800 == ((val) & 0xFC00))

/* Nonzero if the 16-bit unit VAL is a low surrogate, i.e. its top
   six bits are those of 0xDC00.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (0xDC00 == ((val) & 0xFC00))

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or a low surrogate.  */
#define UTF_16_INVALID_P(val)   \
  ((0xFFFE == (val))            \
   || (0xFFFF == (val))         \
   || UTF_16_LOW_SURROGATE_P (val))
1390
1391
1392 static int
1393 detect_coding_utf_16 (coding, detect_info)
1394      struct coding_system *coding;
1395      struct coding_detection_info *detect_info;
1396 {
1397   const unsigned char *src = coding->source, *src_base = src;
1398   const unsigned char *src_end = coding->source + coding->src_bytes;
1399   int multibytep = coding->src_multibyte;
1400   int consumed_chars = 0;
1401   int c1, c2;
1402
1403   detect_info->checked |= CATEGORY_MASK_UTF_16;
1404   if (coding->mode & CODING_MODE_LAST_BLOCK
1405       && (coding->src_chars & 1))
1406     {
1407       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1408       return 0;
1409     }
1410
1411   ONE_MORE_BYTE (c1);
1412   ONE_MORE_BYTE (c2);
1413   if ((c1 == 0xFF) && (c2 == 0xFE))
1414     {
1415       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1416                              | CATEGORY_MASK_UTF_16_AUTO);
1417       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1418                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1419                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1420     }
1421   else if ((c1 == 0xFE) && (c2 == 0xFF))
1422     {
1423       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1424                              | CATEGORY_MASK_UTF_16_AUTO);
1425       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1426                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1427                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1428     }
1429   else if (c1 >= 0 && c2 >= 0)
1430     {
1431       detect_info->rejected
1432         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1433     }
1434  no_more_source:
1435   return 1;
1436 }
1437
1438 static void
1439 decode_coding_utf_16 (coding)
1440      struct coding_system *coding;
1441 {
1442   const unsigned char *src = coding->source + coding->consumed;
1443   const unsigned char *src_end = coding->source + coding->src_bytes;
1444   const unsigned char *src_base;
1445   int *charbuf = coding->charbuf + coding->charbuf_used;
1446   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1447   int consumed_chars = 0, consumed_chars_base;
1448   int multibytep = coding->src_multibyte;
1449   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1450   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1451   int surrogate = CODING_UTF_16_SURROGATE (coding);
1452   Lisp_Object attr, charset_list;
1453
1454   CODING_GET_INFO (coding, attr, charset_list);
1455
1456   if (bom == utf_16_with_bom)
1457     {
1458       int c, c1, c2;
1459
1460       src_base = src;
1461       ONE_MORE_BYTE (c1);
1462       ONE_MORE_BYTE (c2);
1463       c = (c1 << 8) | c2;
1464
1465       if (endian == utf_16_big_endian
1466           ? c != 0xFEFF : c != 0xFFFE)
1467         {
1468           /* The first two bytes are not BOM.  Treat them as bytes
1469              for a normal character.  */
1470           src = src_base;
1471           coding->errors++;
1472         }
1473       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1474     }
1475   else if (bom == utf_16_detect_bom)
1476     {
1477       /* We have already tried to detect BOM and failed in
1478          detect_coding.  */
1479       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1480     }
1481
1482   while (1)
1483     {
1484       int c, c1, c2;
1485
1486       src_base = src;
1487       consumed_chars_base = consumed_chars;
1488
1489       if (charbuf + 2 >= charbuf_end)
1490         break;
1491
1492       ONE_MORE_BYTE (c1);
1493       if (c1 < 0)
1494         {
1495           *charbuf++ = -c1;
1496           continue;
1497         }
1498       ONE_MORE_BYTE (c2);
1499       if (c2 < 0)
1500         {
1501           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1502           *charbuf++ = -c2;
1503           continue;
1504         }
1505       c = (endian == utf_16_big_endian
1506            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1507       if (surrogate)
1508         {
1509           if (! UTF_16_LOW_SURROGATE_P (c))
1510             {
1511               if (endian == utf_16_big_endian)
1512                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1513               else
1514                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1515               *charbuf++ = c1;
1516               *charbuf++ = c2;
1517               coding->errors++;
1518               if (UTF_16_HIGH_SURROGATE_P (c))
1519                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1520               else
1521                 *charbuf++ = c;
1522             }
1523           else
1524             {
1525               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1526               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1527               *charbuf++ = 0x10000 + c;
1528             }
1529         }
1530       else
1531         {
1532           if (UTF_16_HIGH_SURROGATE_P (c))
1533             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1534           else
1535             *charbuf++ = c;
1536         }
1537     }
1538
1539  no_more_source:
1540   coding->consumed_char += consumed_chars_base;
1541   coding->consumed = src_base - coding->source;
1542   coding->charbuf_used = charbuf - coding->charbuf;
1543 }
1544
1545 static int
1546 encode_coding_utf_16 (coding)
1547      struct coding_system *coding;
1548 {
1549   int multibytep = coding->dst_multibyte;
1550   int *charbuf = coding->charbuf;
1551   int *charbuf_end = charbuf + coding->charbuf_used;
1552   unsigned char *dst = coding->destination + coding->produced;
1553   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1554   int safe_room = 8;
1555   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1556   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1557   int produced_chars = 0;
1558   Lisp_Object attrs, charset_list;
1559   int c;
1560
1561   CODING_GET_INFO (coding, attrs, charset_list);
1562
1563   if (bom != utf_16_without_bom)
1564     {
1565       ASSURE_DESTINATION (safe_room);
1566       if (big_endian)
1567         EMIT_TWO_BYTES (0xFE, 0xFF);
1568       else
1569         EMIT_TWO_BYTES (0xFF, 0xFE);
1570       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1571     }
1572
1573   while (charbuf < charbuf_end)
1574     {
1575       ASSURE_DESTINATION (safe_room);
1576       c = *charbuf++;
1577       if (c >= MAX_UNICODE_CHAR)
1578         c = coding->default_char;
1579
1580       if (c < 0x10000)
1581         {
1582           if (big_endian)
1583             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1584           else
1585             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1586         }
1587       else
1588         {
1589           int c1, c2;
1590
1591           c -= 0x10000;
1592           c1 = (c >> 10) + 0xD800;
1593           c2 = (c & 0x3FF) + 0xDC00;
1594           if (big_endian)
1595             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1596           else
1597             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1598         }
1599     }
1600   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1601   coding->produced = dst - coding->destination;
1602   coding->produced_char += produced_chars;
1603   return 0;
1604 }
1605
1606 \f
1607 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1608
1609 /* Emacs' internal format for representation of multiple character
1610    sets is a kind of multi-byte encoding, i.e. characters are
1611    represented by variable-length sequences of one-byte codes.
1612
1613    ASCII characters and control characters (e.g. `tab', `newline') are
1614    represented by one-byte sequences which are their ASCII codes, in
1615    the range 0x00 through 0x7F.
1616
1617    8-bit characters of the range 0x80..0x9F are represented by
1618    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1619    code + 0x20).
1620
1621    8-bit characters of the range 0xA0..0xFF are represented by
1622    one-byte sequences which are their 8-bit code.
1623
1624    The other characters are represented by a sequence of `base
1625    leading-code', optional `extended leading-code', and one or two
1626    `position-code's.  The length of the sequence is determined by the
1627    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1628    whereas extended leading-code and position-code take the range 0xA0
1629    through 0xFF.  See `charset.h' for more details about leading-code
1630    and position-code.
1631
1632    --- CODE RANGE of Emacs' internal format ---
1633    character set        range
1634    -------------        -----
1635    ascii                0x00..0x7F
1636    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1637    eight-bit-graphic    0xA0..0xFF
1638    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1639    ---------------------------------------------
1640
1641    As this is the internal character representation, the format is
1642    usually not used externally (i.e. in a file or in a data sent to a
1643    process).  But, it is possible to have a text externally in this
1644    format (i.e. by encoding by the coding system `emacs-mule').
1645
1646    In that case, a sequence of one-byte codes has a slightly different
1647    form.
1648
1649    At first, all characters in eight-bit-control are represented by
1650    one-byte sequences which are their 8-bit code.
1651
1652    Next, character composition data are represented by the byte
1653    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1654    where,
1655         METHOD is 0xF2 plus one of composition method (enum
1656         composition_method),
1657
1658         BYTES is 0xA0 plus a byte length of this composition data,
1659
1660         CHARS is 0xA0 plus a number of characters composed by this
1661         data,
1662
1663         COMPONENTs are characters of multibyte form or composition
1664         rules encoded as two bytes of ASCII codes.
1665
1666    In addition, for backward compatibility, the following formats are
1667    also recognized as composition data on decoding.
1668
1669    0x80 MSEQ ...
1670    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1671
1672    Here,
1673         MSEQ is a multibyte form but in this special format:
1674           ASCII: 0xA0 ASCII_CODE+0x80,
1675           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1676         RULE is a one byte code of the range 0xA0..0xF0 that
1677         represents a composition rule.
1678   */
1679 /* Table indexed by a byte value.  emacs_mule_char switches on the
        entry for the first byte of a sequence (1 through 4; any other
        value aborts), and detect_coding_emacs_mule treats the entry
        minus one as the number of bytes expected to follow.  There is
        no initializer here; the entries are set elsewhere.  */
1680 char emacs_mule_bytes[256];
1681 /* Decode the emacs-mule byte sequence of one character starting at
        SRC, reading no further than the end of CODING's source.  On
        success, return the character, store the number of bytes read in
        *NBYTES and the local consumed_chars count in *NCHARS, and, if ID
        is non-NULL, store the id of the character's charset in *ID.
        Return -1 if the sequence is invalid.  Return -2 through the
        no_more_source label; nothing below jumps there explicitly, so
        presumably ONE_MORE_BYTE (defined elsewhere) does when the source
        is exhausted -- confirm.  The output arguments are left untouched
        on failure.  */
1682 int
1683 emacs_mule_char (coding, src, nbytes, nchars, id)
1684      struct coding_system *coding;
1685      const unsigned char *src;
1686      int *nbytes, *nchars, *id;
1687 {
1688   const unsigned char *src_end = coding->source + coding->src_bytes;
1689   const unsigned char *src_base = src;
1690   int multibytep = coding->src_multibyte;
1691   struct charset *charset;
1692   unsigned code;
1693   int c;
1694   int consumed_chars = 0;
1695
1696   ONE_MORE_BYTE (c);
1697   if (c < 0)
1698     {
           /* NOTE(review): a negative value from ONE_MORE_BYTE appears to
              carry an already decoded character, negated -- confirm
              against the macro.  */
1699       c = -c;
1700       charset = emacs_mule_charset[0];
1701     }
1702   else
1703     {
1704       switch (emacs_mule_bytes[c])
1705         {
1706         case 2:
               /* Leading code followed by one position code.  */
1707           if (! (charset = emacs_mule_charset[c]))
1708             goto invalid_code;
1709           ONE_MORE_BYTE (c);
1710           if (c < 0xA0)
1711             goto invalid_code;
1712           code = c & 0x7F;
1713           break;
1714
1715         case 3:
1716           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1717               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1718             {
                   /* Private leading code, then a byte that selects the
                      charset, then one position code.  */
1719               ONE_MORE_BYTE (c);
1720               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1721                 goto invalid_code;
1722               ONE_MORE_BYTE (c);
1723               if (c < 0xA0)
1724                 goto invalid_code;
1725               code = c & 0x7F;
1726             }
1727           else
1728             {
                   /* Leading code followed by two position codes.  */
1729               if (! (charset = emacs_mule_charset[c]))
1730                 goto invalid_code;
1731               ONE_MORE_BYTE (c);
1732               if (c < 0xA0)
1733                 goto invalid_code;
1734               code = (c & 0x7F) << 8;
1735               ONE_MORE_BYTE (c);
1736               if (c < 0xA0)
1737                 goto invalid_code;
1738               code |= c & 0x7F;
1739             }
1740           break;
1741
1742         case 4:
               /* The first byte is not examined further; the second byte
                  selects the charset and two position codes follow.  */
1743           ONE_MORE_BYTE (c);
1744           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1745             goto invalid_code;
1746           ONE_MORE_BYTE (c);
1747           if (c < 0xA0)
1748             goto invalid_code;
1749           code = (c & 0x7F) << 8;
1750           ONE_MORE_BYTE (c);
1751           if (c < 0xA0)
1752             goto invalid_code;
1753           code |= c & 0x7F;
1754           break;
1755
1756         case 1:
               /* A single byte: ASCII or an eight-bit byte.  */
1757           code = c;
1758           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1759                                      ? charset_ascii : charset_eight_bit);
1760           break;
1761
1762         default:
1763           abort ();
1764         }
1765       c = DECODE_CHAR (charset, code);
1766       if (c < 0)
1767         goto invalid_code;
1768     }
1769   *nbytes = src - src_base;
1770   *nchars = consumed_chars;
1771   if (id)
1772     *id = charset->id;
1773   return c;
1774
1775  no_more_source:
1776   return -2;
1777
1778  invalid_code:
1779   return -1;
1780 }
1781
1782
1783 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1784    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1785    else return 0.
        CATEGORY_MASK_EMACS_MULE is always set in DETECT_INFO->checked.
        It is set in DETECT_INFO->rejected when 0 is returned, and in
        DETECT_INFO->found when 1 is returned and at least one multi-byte
        sequence or composition was seen.  The scan starts after the
        first CODING->head_ascii bytes of the source.  */
1786
1787 static int
1788 detect_coding_emacs_mule (coding, detect_info)
1789      struct coding_system *coding;
1790      struct coding_detection_info *detect_info;
1791 {
1792   const unsigned char *src = coding->source, *src_base;
1793   const unsigned char *src_end = coding->source + coding->src_bytes;
1794   int multibytep = coding->src_multibyte;
1795   int consumed_chars = 0;
1796   int c;
1797   int found = 0;
1798
1799   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1800   /* A coding system of this category is always ASCII compatible.  */
1801   src += coding->head_ascii;
1802
       /* The loop is left either by `break' (the text is rejected) or by
          a jump to no_more_source, which is not targeted explicitly
          below; presumably ONE_MORE_BYTE jumps there at the end of the
          source -- confirm.  */
1803   while (1)
1804     {
1805       src_base = src;
1806       ONE_MORE_BYTE (c);
1807       if (c < 0)
1808         continue;
1809       if (c == 0x80)
1810         {
1811           /* Perhaps the start of composite character.  We simply skip
1812              it because analyzing it is too heavy for detecting.  But,
1813              at least, we check that the composite character
1814              consists of more than 4 bytes.  */
               /* This declaration deliberately shadows the outer SRC_BASE
                  so that the length test below is relative to the start
                  of the composition data.  */
1815           const unsigned char *src_base;
1816
1817         repeat:
1818           src_base = src;
1819           do
1820             {
1821               ONE_MORE_BYTE (c);
1822             }
1823           while (c >= 0xA0);
1824
1825           if (src - src_base <= 4)
1826             break;
1827           found = CATEGORY_MASK_EMACS_MULE;
1828           if (c == 0x80)
1829             goto repeat;
1830         }
1831
1832       if (c < 0x80)
1833         {
1834           if (c < 0x20
1835               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1836             break;
1837         }
1838       else
1839         {
               /* Index by C, the byte just read, not by *SRC_BASE: after a
                  composition has been skipped above, the outer SRC_BASE
                  still points at its leading 0x80 while C is the byte
                  that ended it.  */
1840           int more_bytes = emacs_mule_bytes[c] - 1;
1841
1842           while (more_bytes > 0)
1843             {
1844               ONE_MORE_BYTE (c);
1845               if (c < 0xA0)
1846                 {
1847                   src--;        /* Unread the last byte.  */
1848                   break;
1849                 }
1850               more_bytes--;
1851             }
1852           if (more_bytes != 0)
1853             break;
1854           found = CATEGORY_MASK_EMACS_MULE;
1855         }
1856     }
1857   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1858   return 0;
1859
1860  no_more_source:
       /* Running out of source in the middle of a sequence is a rejection
          only when this is the last block.  */
1861   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1862     {
1863       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1864       return 0;
1865     }
1866   detect_info->found |= found;
1867   return 1;
1868 }
1869
1870
1871 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1872
1873 /* Decode a character represented as a component of a composition
1874    sequence of Emacs 20/21 style at SRC by calling emacs_mule_char.
1875    On success, store the character in *BUF, increment BUF, and advance
1876    SRC and consumed_chars past it.  If SRC is at SRC_END, or if
1877    emacs_mule_char returns -2, `break' out of the loop (or other
1878    breakable statement) enclosing the use of this macro.  For any
        other negative result, jump to invalid_code.  The macro ends in a
        dangling `else' so that the semicolon written after a use forms
        its empty else-branch.  */
1879
1880 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1881   if (1)                                                        \
1882     {                                                           \
1883       int c;                                                    \
1884       int nbytes, nchars;                                       \
1885                                                                 \
1886       if (src == src_end)                                       \
1887         break;                                                  \
1888       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1889       if (c < 0)                                                \
1890         {                                                       \
1891           if (c == -2)                                          \
1892             break;                                              \
1893           goto invalid_code;                                    \
1894         }                                                       \
1895       *buf++ = c;                                               \
1896       src += nbytes;                                            \
1897       consumed_chars += nchars;                                 \
1898     }                                                           \
1899   else
1900
1901
1902 /* Decode a composition rule represented as a component of composition
1903    sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
1904    and increment BUF.  If no byte is left at SRC, or the byte is not
1905    a valid rule, jump to invalid_code.
        A rule is one byte in the range 0xA0..0xF0 (see the description
        of the emacs-mule format above): 0xA0 plus GREF * 9 + NREF, 81
        values in all.  */
1906
1907 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1908   do {                                                  \
1909     int c, gref, nref;                                  \
1910                                                         \
1911     if (src >= src_end)                                 \
1912       goto invalid_code;                                \
1913     ONE_MORE_BYTE_NO_CHECK (c);                         \
1914     c -= 0xA0;                                          \
1915     if (c < 0 || c >= 81)                               \
1916       goto invalid_code;                                \
1917                                                         \
1918     gref = c / 9, nref = c % 9;                         \
1919     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1920   } while (0)
1921
1922
1923 /* Decode a composition rule represented as a component of composition
1924    sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
1925    and increment BUF.  If fewer than two bytes remain at SRC, or
1926    either byte is out of range, jump to invalid_code.
        The rule occupies two bytes, GREF + 0x20 followed by NREF + 0x20.
        NOTE(review): each half is accepted up to 80, whereas the Emacs
        20 form splits one value below 81 into c / 9 and c % 9 -- check
        whether the bound here should be 9.  */
1927
1928 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1929   do {                                                  \
1930     int gref, nref;                                     \
1931                                                         \
1932     if (src + 1>= src_end)                              \
1933       goto invalid_code;                                \
1934     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1935     gref -= 0x20;                                       \
1936     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1937     nref -= 0x20;                                       \
1938     if (gref < 0 || gref >= 81                          \
1939         || nref < 0 || nref >= 81)                      \
1940       goto invalid_code;                                \
1941     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1942   } while (0)
1943
1944 /* Decode the header of an Emacs 21 style composition whose METHOD
        byte is already in C, and add a composition annotation to charbuf
        with ADD_COMPOSITION_DATA.  For every method other than
        COMPOSITION_RELATIVE, also decode the components that follow and
        append them to charbuf: characters, alternating with rules at
        the odd positions unless the method is COMPOSITION_WITH_ALTCHARS.
        The first slot of the annotation is then decreased by the number
        of components appended (presumably it holds the negated
        annotation length -- confirm against ADD_COMPOSITION_DATA).
        Jump to invalid_code if a header byte is negative, if BYTES is
        less than 3, if a component is malformed, or if the components
        end before the limit derived from BYTES is reached.
        NOTE(review): the limit adds a byte length to consumed_chars, a
        character count; the two presumably agree only for a unibyte
        source -- confirm.  */
1945 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1946   do {                                                                  \
1947     /* Emacs 21 style format.  The byte already in C is (METHOD +       \
1948        0xF2); the next two bytes at SRC are (BYTES + 0xA0) and (CHARS   \
1949        + 0xA0), where BYTES is the byte length of this composition      \
1950        information and CHARS is the number of characters composed.  */  \
1951     enum composition_method method = c - 0xF2;                          \
1952     int *charbuf_base = charbuf;                                        \
1953     int consumed_chars_limit;                                           \
1954     int nbytes, nchars;                                                 \
1955                                                                         \
1956     ONE_MORE_BYTE (c);                                                  \
1957     if (c < 0)                                                          \
1958       goto invalid_code;                                                \
1959     nbytes = c - 0xA0;                                                  \
1960     if (nbytes < 3)                                                     \
1961       goto invalid_code;                                                \
1962     ONE_MORE_BYTE (c);                                                  \
1963     if (c < 0)                                                          \
1964       goto invalid_code;                                                \
1965     nchars = c - 0xA0;                                                  \
1966     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
1967     consumed_chars_limit = consumed_chars_base + nbytes;                \
1968     if (method != COMPOSITION_RELATIVE)                                 \
1969       {                                                                 \
1970         int i = 0;                                                      \
1971         while (consumed_chars < consumed_chars_limit)                   \
1972           {                                                             \
1973             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1974               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1975             else                                                        \
1976               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1977             i++;                                                        \
1978           }                                                             \
1979         if (consumed_chars < consumed_chars_limit)                      \
1980           goto invalid_code;                                            \
1981         charbuf_base[0] -= i;                                           \
1982       }                                                                 \
1983   } while (0)
1984
1985 /* Decode an Emacs 20 style relative composition (0x80 MSEQ ...).
        Scanning restarts just after the 0x80 at SRC_BASE, so the byte
        already read into C is decoded again as part of the first
        character.  Up to MAX_COMPOSITION_COMPONENTS characters are
        collected with DECODE_EMACS_MULE_COMPOSITION_CHAR; fewer than two
        is an error (jump to invalid_code).  A composition annotation is
        then added to charbuf, followed by the characters.  */
1986 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1987   do {                                                          \
1988     /* Emacs 20 style format for relative composition.  */      \
1989     /* Store multibyte form of characters to be composed.  */   \
1990     enum composition_method method = COMPOSITION_RELATIVE;      \
1991     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1992     int *buf = components;                                      \
1993     int i, j;                                                   \
1994                                                                 \
         /* Rewind the consumed count together with SRC so that  */  \
         /* the bytes read again below are not counted twice.    */  \
1995     src = src_base;                                             \
         consumed_chars = consumed_chars_base;                       \
1996     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1997     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1998       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
1999     if (i < 2)                                                  \
2000       goto invalid_code;                                        \
2001     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
2002     for (j = 0; j < i; j++)                                     \
2003       *charbuf++ = components[j];                               \
2004   } while (0)
2005
2006 /* Decode an Emacs 20 style rule-base composition (0x80 0xFF MSEQ
        RULE MSEQ ... MSEQ).  Characters and rules are collected
        alternately into COMPONENTS, at most MAX_COMPOSITION_COMPONENTS
        characters, which is exactly what the array can hold.  Collection
        stops early at a byte below 0xA0, which cannot start a RULE or a
        MSEQ (it tests the raw byte at SRC, which presumably differs from
        what ONE_MORE_BYTE would yield for a multibyte source --
        confirm).  Fewer than two characters, or a sequence ending with
        a rule, is an error (jump to invalid_code).  If charbuf cannot
        hold the result, jump to no_more_source.  Otherwise add a
        composition annotation to charbuf, append the components to it,
        adjust the first annotation slot by their number in the same way
        as DECODE_EMACS_MULE_21_COMPOSITION does, and finally append the
        composed characters themselves.  */
2007 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
2008   do {                                                          \
2009     /* Emacs 20 style format for rule-base composition.  */     \
2010     /* Store multibyte form of characters to be composed.  */   \
2011     enum composition_method method = COMPOSITION_WITH_RULE;     \
2012     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
2013     int *buf = components;                                      \
         int *charbuf_base = charbuf;                                \
2014     int i, j, ncomponents;                                      \
2015                                                                 \
2016     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
         /* I counts the characters collected so far.  */            \
2017     for (i = 1; i < MAX_COMPOSITION_COMPONENTS; i++)            \
2018       {                                                         \
             if (src < src_end && *src < 0xA0)                       \
               break;                                                \
2019         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
2020         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
2021       }                                                         \
         ncomponents = buf - components;                             \
2022     if (i < 2 || ncomponents % 2 == 0)                          \
2023       goto invalid_code;                                        \
2024     if (charbuf + ncomponents + i > charbuf_end)                \
2025       goto no_more_source;                                      \
2026     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
2027     for (j = 0; j < ncomponents; j++)                           \
2028       *charbuf++ = components[j];                               \
         charbuf_base[0] -= ncomponents;                             \
         /* The characters sit at the even positions.  */            \
2029     for (j = 0; j < ncomponents; j += 2)                        \
2030       *charbuf++ = components[j];                               \
2031   } while (0)
2032
2033 /* Decode the emacs-mule bytes of CODING's source, starting at offset
        CODING->consumed, into character codes appended to
        CODING->charbuf.  Decoding stops when the source runs out or when
        charbuf is filled up to MAX_ANNOTATION_LENGTH slots before its
        end.  A run of characters decoded by emacs_mule_char from the
        same non-ASCII charset is followed by an ADD_CHARSET_DATA
        annotation.  A byte that does not start a valid sequence is
        stored as a raw byte and counted in CODING->errors.  On exit,
        CODING->consumed, CODING->consumed_char and CODING->charbuf_used
        are updated; the consumed counts stop at the start of the last
        sequence that was begun.  */
2034 static void
2035 decode_coding_emacs_mule (coding)
2036      struct coding_system *coding;
2037 {
2038   const unsigned char *src = coding->source + coding->consumed;
2039   const unsigned char *src_end = coding->source + coding->src_bytes;
2040   const unsigned char *src_base;
2041   int *charbuf = coding->charbuf + coding->charbuf_used;
2042   int *charbuf_end
2043     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2044   int consumed_chars = 0, consumed_chars_base;
2045   int multibytep = coding->src_multibyte;
2046   Lisp_Object attrs, charset_list;
2047   int char_offset = coding->produced_char;
2048   int last_offset = char_offset;
2049   int last_id = charset_ascii;
2050
2051   CODING_GET_INFO (coding, attrs, charset_list);
2052
2053   while (1)
2054     {
2055       int c;
2056
           /* Remember where this sequence starts so that it can be read
              again, either below or on the next call.  */
2057       src_base = src;
2058       consumed_chars_base = consumed_chars;
2059
2060       if (charbuf >= charbuf_end)
2061         break;
2062
2063       ONE_MORE_BYTE (c);
2064       if (c < 0)
2065         {
2066           *charbuf++ = -c;
2067           char_offset++;
2068         }
2069       else if (c < 0x80)
2070         {
2071           *charbuf++ = c;
2072           char_offset++;
2073         }
2074       else if (c == 0x80)
2075         {
               /* Composition data; the next byte selects the format.  */
2076           ONE_MORE_BYTE (c);
2077           if (c < 0)
2078             goto invalid_code;
2079           if (c - 0xF2 >= COMPOSITION_RELATIVE
2080               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2081             DECODE_EMACS_MULE_21_COMPOSITION (c);
2082           else if (c < 0xC0)
2083             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2084           else if (c == 0xFF)
2085             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2086           else
2087             goto invalid_code;
2088         }
2089       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2090         {
2091           int nbytes, nchars;
2092           int id;
2093
               /* Read the whole sequence again from its leading code.  */
2094           src = src_base;
2095           consumed_chars = consumed_chars_base;
2096           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2097           if (c < 0)
2098             {
2099               if (c == -2)
2100                 break;
2101               goto invalid_code;
2102             }
2103           if (last_id != id)
2104             {
2105               if (last_id != charset_ascii)
2106                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2107               last_id = id;
2108               last_offset = char_offset;
2109             }
2110           *charbuf++ = c;
2111           src += nbytes;
2112           consumed_chars += nchars;
2113           char_offset++;
2114         }
           else
             /* A byte of 0xA0 or above, or one below 0xA0 that does not
                start a multi-byte sequence.  Keep it as a raw byte
                instead of silently dropping it.  */
             goto invalid_code;
2115       continue;
2116
2117     invalid_code:
           /* Store the first byte of the offending sequence as it is and
              resume decoding right after it.  */
2118       src = src_base;
2119       consumed_chars = consumed_chars_base;
2120       ONE_MORE_BYTE (c);
2121       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2122       char_offset++;
2123       coding->errors++;
2124     }
2125
2126  no_more_source:
2127   if (last_id != charset_ascii)
2128     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2129   coding->consumed_char += consumed_chars_base;
2130   coding->consumed = src_base - coding->source;
2131   coding->charbuf_used = charbuf - coding->charbuf;
2132 }
2133
2134 /* Store in CODES[0] and CODES[1] the leading codes for the emacs-mule
        charset id ID.  An ID below 0xA0 is its own leading code and
        CODES[1] is set to 0.  Otherwise CODES[0] is one of 0x9A through
        0x9D, chosen by the range ID falls in, and CODES[1] is ID.  ID is
        evaluated more than once.  There is no semicolon after
        `while (0)', so a use followed by a semicolon is exactly one
        statement.  */
2135 #define EMACS_MULE_LEADING_CODES(id, codes)     \
2136   do {                                          \
2137     if ((id) < 0xA0)                            \
2138       (codes)[0] = (id), (codes)[1] = 0;        \
2139     else if ((id) < 0xE0)                       \
2140       (codes)[0] = 0x9A, (codes)[1] = (id);     \
2141     else if ((id) < 0xF0)                       \
2142       (codes)[0] = 0x9B, (codes)[1] = (id);     \
2143     else if ((id) < 0xF5)                       \
2144       (codes)[0] = 0x9C, (codes)[1] = (id);     \
2145     else                                        \
2146       (codes)[0] = 0x9D, (codes)[1] = (id);     \
2147   } while (0)
2148
2149 /* Encode the characters in CODING->charbuf in the emacs-mule format
        and append the bytes to CODING->destination, starting at offset
        CODING->produced.  ASCII characters and raw eight-bit bytes are
        written as single bytes.  Any other character is written as the
        leading code(s) of its charset followed by one or two position
        codes with the high bit set.  A charset annotation in charbuf
        names a preferred charset, which is used for the following
        characters that belong to it.  A character that is in no charset
        of the list is replaced by CODING->default_char.  Updates
        CODING->produced and CODING->produced_char and returns 0 on the
        path visible here.  */
2150 static int
2151 encode_coding_emacs_mule (coding)
2152      struct coding_system *coding;
2153 {
2154   int multibytep = coding->dst_multibyte;
2155   int *charbuf = coding->charbuf;
2156   int *charbuf_end = charbuf + coding->charbuf_used;
2157   unsigned char *dst = coding->destination + coding->produced;
2158   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2159   int safe_room = 8;
2160   int produced_chars = 0;
2161   Lisp_Object attrs, charset_list;
2162   int c;
2163   int preferred_charset_id = -1;
2164
2165   CODING_GET_INFO (coding, attrs, charset_list);
       /* Always encode with the emacs-mule charset list, and record that
          in the coding system's attributes.  */
2166   if (! EQ (charset_list, Vemacs_mule_charset_list))
2167     {
2168       CODING_ATTR_CHARSET_LIST (attrs)
2169         = charset_list = Vemacs_mule_charset_list;
2170     }
2171
2172   while (charbuf < charbuf_end)
2173     {
2174       ASSURE_DESTINATION (safe_room);
2175       c = *charbuf++;
2176
2177       if (c < 0)
2178         {
2179           /* Handle an annotation.  */
               /* C is the negated length of the annotation; the slots
                  after it are skipped below.  */
2180           switch (*charbuf)
2181             {
2182             case CODING_ANNOTATE_COMPOSITION_MASK:
2183               /* Not yet implemented.  */
2184               break;
2185             case CODING_ANNOTATE_CHARSET_MASK:
2186               preferred_charset_id = charbuf[3];
2187               if (preferred_charset_id >= 0
2188                   && NILP (Fmemq (make_number (preferred_charset_id),
2189                                   charset_list)))
2190                 preferred_charset_id = -1;
2191               break;
2192             default:
2193               abort ();
2194             }
2195           charbuf += -c - 1;
2196           continue;
2197         }
2198
2199       if (ASCII_CHAR_P (c))
2200         EMIT_ONE_ASCII_BYTE (c);
2201       else if (CHAR_BYTE8_P (c))
2202         {
2203           c = CHAR_TO_BYTE8 (c);
2204           EMIT_ONE_BYTE (c);
2205         }
2206       else
2207         {
2208           struct charset *charset;
2209           unsigned code;
2210           int dimension;
2211           int emacs_mule_id;
2212           unsigned char leading_codes[2];
2213
               /* Every path below that leaves CHARSET non-null must also
                  set CODE, which is written out as the position
                  code(s).  */
2214           if (preferred_charset_id >= 0)
2215             {
2216               charset = CHARSET_FROM_ID (preferred_charset_id);
2217               if (CHAR_CHARSET_P (c, charset))
                     /* ENCODE_CHAR is taken to be the counterpart of
                        DECODE_CHAR (charset, code) -- confirm.  */
                     code = ENCODE_CHAR (charset, c);
                   else
2218                 charset = char_charset (c, charset_list, &code);
2219             }
2220           else
2221             charset = char_charset (c, charset_list, &code);
2222           if (! charset)
2223             {
2224               c = coding->default_char;
2225               if (ASCII_CHAR_P (c))
2226                 {
2227                   EMIT_ONE_ASCII_BYTE (c);
2228                   continue;
2229                 }
2230               charset = char_charset (c, charset_list, &code);
2231             }
2232           dimension = CHARSET_DIMENSION (charset);
2233           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2234           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2235           EMIT_ONE_BYTE (leading_codes[0]);
2236           if (leading_codes[1])
2237             EMIT_ONE_BYTE (leading_codes[1]);
2238           if (dimension == 1)
2239             EMIT_ONE_BYTE (code | 0x80);
2240           else
2241             {
2242               code |= 0x8080;
2243               EMIT_ONE_BYTE (code >> 8);
2244               EMIT_ONE_BYTE (code & 0xFF);
2245             }
2246         }
2247     }
2248   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2249   coding->produced_char += produced_chars;
2250   coding->produced = dst - coding->destination;
2251   return 0;
2252 }
2253
2254 \f
2255 /*** 7. ISO2022 handlers ***/
2256
2257 /* The following note describes the coding system ISO2022 briefly.
2258    Since the intention of this note is to help understand the
2259    functions in this file, some parts are NOT ACCURATE or are OVERLY
2260    SIMPLIFIED.  For thorough understanding, please refer to the
2261    original document of ISO2022.  This is equivalent to the standard
2262    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2263
2264    ISO2022 provides many mechanisms to encode several character sets
2265    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2266    is encoded using bytes less than 128.  This may make the encoded
2267    text a little bit longer, but the text passes more easily through
2268    several types of gateway, some of which strip off the MSB (Most
2269    Significant Bit).
2270
2271    There are two kinds of character sets: control character sets and
2272    graphic character sets.  The former contain control characters such
2273    as `newline' and `escape' to provide control functions (control
2274    functions are also provided by escape sequences).  The latter
2275    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2276    two control character sets and many graphic character sets.
2277
2278    Graphic character sets are classified into one of the following
2279    four classes, according to the number of bytes (DIMENSION) and
2280    number of characters in one dimension (CHARS) of the set:
2281    - DIMENSION1_CHARS94
2282    - DIMENSION1_CHARS96
2283    - DIMENSION2_CHARS94
2284    - DIMENSION2_CHARS96
2285
2286    In addition, each character set is assigned an identification tag,
2287    unique for each set, called the "final character" (denoted as <F>
2288    hereafter).  The <F> of each character set is decided by ECMA(*)
2289    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2290    (0x30..0x3F are for private use only).
2291
2292    Note (*): ECMA = European Computer Manufacturers Association
2293
2294    Here are examples of graphic character sets [NAME(<F>)]:
2295         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2296         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2297         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2298         o DIMENSION2_CHARS96 -- none for the moment
2299
2300    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2301         C0 [0x00..0x1F] -- control character plane 0
2302         GL [0x20..0x7F] -- graphic character plane 0
2303         C1 [0x80..0x9F] -- control character plane 1
2304         GR [0xA0..0xFF] -- graphic character plane 1
2305
2306    A control character set is directly designated and invoked to C0 or
2307    C1 by an escape sequence.  The most common case is that:
2308    - ISO646's  control character set is designated/invoked to C0, and
2309    - ISO6429's control character set is designated/invoked to C1,
2310    and usually these designations/invocations are omitted in encoded
2311    text.  In a 7-bit environment, only C0 can be used, and a control
2312    character for C1 is encoded by an appropriate escape sequence to
2313    fit into the environment.  All control characters for C1 are
2314    defined to have corresponding escape sequences.
2315
2316    A graphic character set is at first designated to one of four
2317    graphic registers (G0 through G3), then these graphic registers are
2318    invoked to GL or GR.  These designations and invocations can be
2319    done independently.  The most common case is that G0 is invoked to
2320    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2321    these invocations and designations are omitted in encoded text.
2322    In a 7-bit environment, only GL can be used.
2323
2324    When a graphic character set of CHARS94 is invoked to GL, codes
2325    0x20 and 0x7F of the GL area work as control characters SPACE and
2326    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2327    be used.
2328
2329    There are two ways of invocation: locking-shift and single-shift.
2330    With locking-shift, the invocation lasts until the next different
2331    invocation, whereas with single-shift, the invocation affects the
2332    following character only and doesn't affect the locking-shift
2333    state.  Invocations are done by the following control characters or
2334    escape sequences:
2335
2336    ----------------------------------------------------------------------
2337    abbrev  function                  cntrl escape seq   description
2338    ----------------------------------------------------------------------
2339    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2340    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2341    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2342    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2343    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2344    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2345    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2346    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2347    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2348    ----------------------------------------------------------------------
2349    (*) These are not used by any known coding system.
2350
2351    Control characters for these functions are defined by macros
2352    ISO_CODE_XXX in `coding.h'.
2353
2354    Designations are done by the following escape sequences:
2355    ----------------------------------------------------------------------
2356    escape sequence      description
2357    ----------------------------------------------------------------------
2358    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2359    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2360    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2361    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2362    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2363    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2364    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2365    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2366    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2367    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2368    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2369    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2370    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2371    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2372    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2373    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2374    ----------------------------------------------------------------------
2375
2376    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2377    of dimension 1, chars 94, and final character <F>, etc...
2378
2379    Note (*): Although these designations are not allowed in ISO2022,
2380    Emacs accepts them on decoding, and produces them on encoding
2381    CHARS96 character sets in a coding system which is characterized as
2382    7-bit environment, non-locking-shift, and non-single-shift.
2383
2384    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2385    '(' must be omitted.  We refer to this as "short-form" hereafter.
2386
2387    Now you may notice that there are a lot of ways of encoding the
2388    same multilingual text in ISO2022.  Actually, there exist many
2389    coding systems such as Compound Text (used in X11's inter-client
2390    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2391    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2392    localized platforms), and all of these are variants of ISO2022.
2393
2394    In addition to the above, Emacs handles two more kinds of escape
2395    sequences: ISO6429's direction specification and Emacs' private
2396    sequence for specifying character composition.
2397
2398    ISO6429's direction specification takes the following form:
2399         o CSI ']'      -- end of the current direction
2400         o CSI '0' ']'  -- end of the current direction
2401         o CSI '1' ']'  -- start of left-to-right text
2402         o CSI '2' ']'  -- start of right-to-left text
2403    The control character CSI (0x9B: control sequence introducer) is
2404    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2405
2406    Character composition specification takes the following form:
2407         o ESC '0' -- start relative composition
2408         o ESC '1' -- end composition
2409         o ESC '2' -- start rule-base composition (*)
2410         o ESC '3' -- start relative composition with alternate chars  (**)
2411         o ESC '4' -- start rule-base composition with alternate chars  (**)
2412   Since these are not standard escape sequences of any ISO standard,
2413   the use of them with these meanings is restricted to Emacs only.
2414
2415   (*) This form is used only in Emacs 20.7 and older versions,
2416   but newer versions can safely decode it.
2417   (**) This form is used only in Emacs 21.1 and newer versions,
2418   and older versions can't decode it.
2419
2420   Here's a list of example usages of these composition escape
2421   sequences (categorized by `enum composition_method').
2422
2423   COMPOSITION_RELATIVE:
2424         ESC 0 CHAR [ CHAR ] ESC 1
2425   COMPOSITION_WITH_RULE:
2426         ESC 2 CHAR [ RULE CHAR ] ESC 1
2427   COMPOSITION_WITH_ALTCHARS:
2428         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2429   COMPOSITION_WITH_RULE_ALTCHARS:
2430         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2431
/* Table giving the class (an `enum iso_code_class_type') of each of
   the 256 possible byte values, indexed by the byte value itself.
   decode_coding_iso_2022 dispatches on iso_code_class[C1] for the
   byte C1 it has just read.  */
2432 enum iso_code_class_type iso_code_class[256];
2433
/* Return nonzero if the charset whose ID is ID is usable with CODING.
   CODING->safe_charsets is a byte table indexed by charset ID and
   valid for IDs up to CODING->max_charset_id.  An entry with its high
   bit set (such as the filler value 255 that setup_iso_safe_charsets
   puts in the table) means the charset is not usable; any other entry
   means it is.  The high bit is tested instead of the sign of the
   entry so that the result does not depend on whether plain `char' is
   signed on the host.  ID must not be negative, and is evaluated more
   than once.  */
#define SAFE_CHARSET_P(coding, id)      \
  ((id) <= (coding)->max_charset_id     \
   && ((coding)->safe_charsets[id] & 0x80) == 0)
2437
2438
/* Nonzero if element 1 of the CODING_ISO_INITIAL data of the coding
   system registered for CATEGORY in `coding_categories' is not
   negative.  NOTE(review): presumably that element is the charset
   initially designated to graphic register G1, so that a shift-out
   (invoking G1) is meaningful for the category -- confirm against
   the definition of CODING_ISO_INITIAL.  */
2439 #define SHIFT_OUT_OK(category)  \
2440   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2441
/* Make sure the coding system attribute vector ATTRS has a
   safe-charsets table in its coding_attr_safe_charsets slot.

   The table is a string with one element for each charset ID from 0
   up to the largest ID found in the charset list of ATTRS.  Every
   element starts out as 255.  For each charset in the list the
   element is then set to the value associated with the charset in
   the coding_attr_iso_request alist, or, when there is none, to the
   car (charsets without iso_chars_96) or the cdr (charsets with
   iso_chars_96) of the coding_attr_iso_usage pair, provided that
   value is less than 4.  Otherwise the element stays 255.

   If CODING_ISO_FLAG_FULL_SUPPORT is set in the ISO flags and the
   charset list is not Viso_2022_charset_list, the charset list of
   ATTRS is first replaced by Viso_2022_charset_list and any existing
   table is discarded.  If the slot already holds a string after that
   step, nothing is done.  */
2442 static void
2443 setup_iso_safe_charsets (attrs)
2444      Lisp_Object attrs;
2445 {
2446   Lisp_Object charset_list, safe_charsets;
2447   Lisp_Object request;
2448   Lisp_Object reg_usage;
2449   Lisp_Object tail;
2450   int reg94, reg96;
2451   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2452   int max_charset_id;
2453
2454   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2455   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2456       && ! EQ (charset_list, Viso_2022_charset_list))
2457     {
         /* Switch to Viso_2022_charset_list and force the table to be
            rebuilt for it.  */
2458       CODING_ATTR_CHARSET_LIST (attrs)
2459         = charset_list = Viso_2022_charset_list;
2460       ASET (attrs, coding_attr_safe_charsets, Qnil);
2461     }
2462
       /* A string in the slot is a table built by an earlier call.  */
2463   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2464     return;
2465
       /* The table must be indexable by every charset ID in the list,
          so find the largest one.  */
2466   max_charset_id = 0;
2467   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2468     {
2469       int id = XINT (XCAR (tail));
2470       if (max_charset_id < id)
2471         max_charset_id = id;
2472     }
2473
2474   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2475                                 make_number (255));
2476   request = AREF (attrs, coding_attr_iso_request);
2477   reg_usage = AREF (attrs, coding_attr_iso_usage);
       /* REG_USAGE is a pair; its car is the default for charsets
          without iso_chars_96 and its cdr the default for those with
          it.  A default of 4 or more is never stored below.  */
2478   reg94 = XINT (XCAR (reg_usage));
2479   reg96 = XINT (XCDR (reg_usage));
2480
2481   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2482     {
2483       Lisp_Object id;
2484       Lisp_Object reg;
2485       struct charset *charset;
2486
2487       id = XCAR (tail);
2488       charset = CHARSET_FROM_ID (XINT (id));
           /* An explicit entry in the request alist takes precedence
              over the defaults.  */
2489       reg = Fcdr (Fassq (id, request));
2490       if (! NILP (reg))
2491         SSET (safe_charsets, XINT (id), XINT (reg));
2492       else if (charset->iso_chars_96)
2493         {
2494           if (reg96 < 4)
2495             SSET (safe_charsets, XINT (id), reg96);
2496         }
2497       else
2498         {
2499           if (reg94 < 4)
2500             SSET (safe_charsets, XINT (id), reg94);
2501         }
2502     }
2503   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2504 }
2505
2506
2507 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2508    Check if a text is encoded in one of ISO-2022 based coding systems.
2509    If it is, return 1, else return 0.

        The scan starts CODING->head_ascii bytes into CODING->source.
        While scanning, the ISO categories the text cannot belong to are
        accumulated in REJECTED, and those for which positive evidence was
        seen are accumulated in FOUND.

        If every ISO category ends up rejected, CATEGORY_MASK_ISO is added
        to DETECT_INFO->rejected and 0 is returned.  Otherwise control
        reaches the label no_more_source (presumably from ONE_MORE_BYTE
        when the source is exhausted; that macro is defined elsewhere --
        confirm), REJECTED is added to DETECT_INFO->rejected, the part of
        FOUND that was not rejected is added to DETECT_INFO->found, and 1
        is returned.  CATEGORY_MASK_ISO is always added to
        DETECT_INFO->checked.  */
2510
2511 static int
2512 detect_coding_iso_2022 (coding, detect_info)
2513      struct coding_system *coding;
2514      struct coding_detection_info *detect_info;
2515 {
2516   const unsigned char *src = coding->source, *src_base = src;
2517   const unsigned char *src_end = coding->source + coding->src_bytes;
2518   int multibytep = coding->src_multibyte;
2519   int single_shifting = 0;
2520   int id;
2521   int c, c1;
2522   int consumed_chars = 0;
2523   int i;
2524   int rejected = 0;
2525   int found = 0;
2526
2527   detect_info->checked |= CATEGORY_MASK_ISO;
2528
       /* Make the safe-charsets table of every ISO category reachable
          through its max_charset_id and safe_charsets members, which
          SAFE_CHARSET_P consults below.  */
2529   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2530     {
2531       struct coding_system *this = &(coding_categories[i]);
2532       Lisp_Object attrs, val;
2533
2534       attrs = CODING_ID_ATTRS (this->id);
           /* NOTE(review): this compares the safe-charsets value, which
              setup_iso_safe_charsets stores as a string, with the list
              Viso_2022_charset_list.  The test looks as if it was meant
              for CODING_ATTR_CHARSET_LIST -- confirm.  */
2535       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2536           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2537         setup_iso_safe_charsets (attrs);
2538       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2539       this->max_charset_id = SCHARS (val) - 1;
2540       this->safe_charsets = (char *) SDATA (val);
2541     }
2542
2543   /* A coding system of this category is always ASCII compatible.  */
2544   src += coding->head_ascii;
2545
       /* Scan until every ISO category has been rejected.  */
2546   while (rejected != CATEGORY_MASK_ISO)
2547     {
2548       src_base = src;
2549       ONE_MORE_BYTE (c);
2550       switch (c)
2551         {
2552         case ISO_CODE_ESC:
2553           if (inhibit_iso_escape_detection)
2554             break;
2555           single_shifting = 0;
2556           ONE_MORE_BYTE (c);
2557           if (c >= '(' && c <= '/')
2558             {
2559               /* Designation sequence for a charset of dimension 1.  */
2560               ONE_MORE_BYTE (c1);
                   /* The second table index is 0 for the intermediate
                      characters '(' .. '+' and 1 for ',' .. '/'.  */
2561               if (c1 < ' ' || c1 >= 0x80
2562                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2563                 /* Invalid designation sequence.  Just ignore.  */
2564                 break;
2565             }
2566           else if (c == '$')
2567             {
2568               /* Designation sequence for a charset of dimension 2.  */
2569               ONE_MORE_BYTE (c);
2570               if (c >= '@' && c <= 'B')
2571                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2572                 id = iso_charset_table[1][0][c];
2573               else if (c >= '(' && c <= '/')
2574                 {
2575                   ONE_MORE_BYTE (c1);
2576                   if (c1 < ' ' || c1 >= 0x80
2577                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2578                     /* Invalid designation sequence.  Just ignore.  */
2579                     break;
2580                 }
2581               else
2582                 /* Invalid designation sequence.  Just ignore it.  */
2583                 break;
2584             }
2585           else if (c == 'N' || c == 'O')
2586             {
2587               /* ESC <Fe> for SS2 or SS3.  */
2588               single_shifting = 1;
2589               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2590               break;
2591             }
2592           else if (c >= '0' && c <= '4')
2593             {
2594               /* ESC <Fp> for start/end composition.  */
2595               found |= CATEGORY_MASK_ISO;
2596               break;
2597             }
2598           else
2599             {
2600               /* Invalid escape sequence.  Just ignore it.  */
2601               break;
2602             }
2603
               /* Only the two designation branches above reach this
                  point, with ID set to the designated charset.  For
                  each of the four categories tested below, the charset
                  is evidence for the category if it is safe there, and
                  rules the category out otherwise.  */
2604           /* We found a valid designation sequence for CHARSET.  */
2605           rejected |= CATEGORY_MASK_ISO_8BIT;
2606           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2607                               id))
2608             found |= CATEGORY_MASK_ISO_7;
2609           else
2610             rejected |= CATEGORY_MASK_ISO_7;
2611           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2612                               id))
2613             found |= CATEGORY_MASK_ISO_7_TIGHT;
2614           else
2615             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2616           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2617                               id))
2618             found |= CATEGORY_MASK_ISO_7_ELSE;
2619           else
2620             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2621           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2622                               id))
2623             found |= CATEGORY_MASK_ISO_8_ELSE;
2624           else
2625             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2626           break;
2627
2628         case ISO_CODE_SO:
2629         case ISO_CODE_SI:
2630           /* Locking shift out/in.  */
2631           if (inhibit_iso_escape_detection)
2632             break;
2633           single_shifting = 0;
2634           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2635           found |= CATEGORY_MASK_ISO_ELSE;
2636           break;
2637
2638         case ISO_CODE_CSI:
2639           /* Control sequence introducer.  */
2640           single_shifting = 0;
2641           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2642           found |= CATEGORY_MASK_ISO_8_ELSE;
2643           goto check_extra_latin;
2644
2645         case ISO_CODE_SS2:
2646         case ISO_CODE_SS3:
2647           /* Single shift.   */
2648           if (inhibit_iso_escape_detection)
2649             break;
2650           single_shifting = 0;
2651           rejected |= CATEGORY_MASK_ISO_7BIT;
               /* The byte counts as a single shift only if the coding
                  system of category iso_8_1 or iso_8_2 has the
                  single-shift flag.  Otherwise it is checked as a Latin
                  extra code below.  */
2652           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2653               & CODING_ISO_FLAG_SINGLE_SHIFT)
2654             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2655           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2656               & CODING_ISO_FLAG_SINGLE_SHIFT)
2657             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2658           if (single_shifting)
2659             break;
2660           goto check_extra_latin;
2661
2662         default:
               /* NOTE(review): presumably ONE_MORE_BYTE can yield a
                  negative value for source elements that are not plain
                  bytes; such elements are skipped -- confirm against
                  the macro definition.  */
2663           if (c < 0)
2664             continue;
2665           if (c < 0x80)
2666             {
2667               single_shifting = 0;
2668               break;
2669             }
2670           if (c >= 0xA0)
2671             {
2672               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2673               found |= CATEGORY_MASK_ISO_8_1;
2674               /* Check the length of succeeding codes of the range
2675                  0xA0..0xFF.  If the byte length is even, we include
2676                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2677                  only when we are not single shifting.  */
2678               if (! single_shifting
2679                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2680                 {
                       /* This I shadows the function-level I.  It counts
                          the bytes of the current run that are not below
                          0xA0, including the one already read.  */
2681                   int i = 1;
2682                   while (src < src_end)
2683                     {
2684                       ONE_MORE_BYTE (c);
2685                       if (c < 0xA0)
2686                         break;
2687                       i++;
2688                     }
2689
                       /* An odd-length run rejects iso_8_2 only if it
                          ended before the end of the source.  */
2690                   if (i & 1 && src < src_end)
2691                     rejected |= CATEGORY_MASK_ISO_8_2;
2692                   else
2693                     found |= CATEGORY_MASK_ISO_8_2;
2694                 }
2695               break;
2696             }
               /* Here C is at least 0x80 and below 0xA0 when control
                  falls in from above.  The CSI and SS2/SS3 cases jump
                  here too.  C is acceptable only if
                  Vlatin_extra_code_table is a vector whose element C is
                  non-nil.  Otherwise every ISO category is rejected,
                  which ends the scan.  */
2697         check_extra_latin:
2698           single_shifting = 0;
2699           if (! VECTORP (Vlatin_extra_code_table)
2700               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2701             {
2702               rejected = CATEGORY_MASK_ISO;
2703               break;
2704             }
2705           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2706               & CODING_ISO_FLAG_LATIN_EXTRA)
2707             found |= CATEGORY_MASK_ISO_8_1;
2708           else
2709             rejected |= CATEGORY_MASK_ISO_8_1;
2710           rejected |= CATEGORY_MASK_ISO_8_2;
2711         }
2712     }
       /* The loop ended because every ISO category was rejected.  */
2713   detect_info->rejected |= CATEGORY_MASK_ISO;
2714   return 0;
2715
       /* Not every category was rejected.  Report what was rejected and
          what was found but not rejected.  */
2716  no_more_source:
2717   detect_info->rejected |= rejected;
2718   detect_info->found |= (found & ~rejected);
2719   return 1;
2720 }
2721
2722
2723 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
2724    escape sequence should be kept.

        The charset is looked up with ISO_CHARSET_TABLE (DIM, CHARS_96,
        FINAL).  If FINAL is below '0' or not below 128, if the lookup
        yields a negative ID, or if the charset is not SAFE_CHARSET_P for
        `coding', the designation of register REG is set to -2, CHARS_96
        is set to -1, and nothing else is done.

        Otherwise the charset ID is stored as the designation of REG,
        after replacing charset_jisx0201_roman by charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set, and charset_jisx0208_1978 by
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

        The macro uses the variable `coding' of the expansion site.
        CHARS_96 must be an lvalue because it is assigned to, and FINAL
        is evaluated more than once.  */
2725 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2726   do {                                                                  \
2727     int id, prev;                                                       \
2728                                                                         \
2729     if (final < '0' || final >= 128                                     \
2730         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2731         || !SAFE_CHARSET_P (coding, id))                                \
2732       {                                                                 \
2733         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2734         chars_96 = -1;                                                  \
2735         break;                                                          \
2736       }                                                                 \
2737     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2738     if (id == charset_jisx0201_roman)                                   \
2739       {                                                                 \
2740         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2741           id = charset_ascii;                                           \
2742       }                                                                 \
2743     else if (id == charset_jisx0208_1978)                               \
2744       {                                                                 \
2745         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2746           id = charset_jisx0208;                                        \
2747       }                                                                 \
2748     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2749     /* If there was an invalid designation to REG previously, and this  \
2750        designation is ASCII to REG, we should keep this designation     \
2751        sequence.  */                                                    \
2752     if (prev == -2 && id == charset_ascii)                              \
2753       chars_96 = -1;                                                    \
2754   } while (0)
2755
2756
/* Give up the composition in progress, if any, and emit the elements
   collected so far as ordinary characters.

   Nothing is done when composition_state is COMPOSING_NO.  Otherwise,
   if CHARBUF has no room for component_idx more elements, jump to
   no_more_source.  If there is room, composition_state is reset to
   COMPOSING_NO and the elements of `components' are copied to CHARBUF:
   all of them when `method' is COMPOSITION_RELATIVE or
   COMPOSITION_WITH_ALTCHARS, and those at even indices otherwise
   (presumably to skip the interleaved rules -- confirm).  char_offset
   is advanced as well.

   NOTE(review): in the second case the loop stores
   (component_idx + 1) / 2 elements but char_offset grows by
   component_idx / 2 + 1.  The two differ when component_idx is even,
   so confirm whether that can happen here.

   The macro uses these names of the expansion site: composition_state,
   method, components, component_idx, charbuf, charbuf_end, char_offset,
   and the label no_more_source.  */
2757 #define MAYBE_FINISH_COMPOSITION()                              \
2758   do {                                                          \
2759     int i;                                                      \
2760     if (composition_state == COMPOSING_NO)                      \
2761       break;                                                    \
2762     /* It is assured that we have enough room for producing     \
2763        characters stored in the table `components'.  */         \
2764     if (charbuf + component_idx > charbuf_end)                  \
2765       goto no_more_source;                                      \
2766     composition_state = COMPOSING_NO;                           \
2767     if (method == COMPOSITION_RELATIVE                          \
2768         || method == COMPOSITION_WITH_ALTCHARS)                 \
2769       {                                                         \
2770         for (i = 0; i < component_idx; i++)                     \
2771           *charbuf++ = components[i];                           \
2772         char_offset += component_idx;                           \
2773       }                                                         \
2774     else                                                        \
2775       {                                                         \
2776         for (i = 0; i < component_idx; i += 2)                  \
2777           *charbuf++ = components[i];                           \
2778         char_offset += (component_idx / 2) + 1;                 \
2779       }                                                         \
2780   } while (0)
2781
2782
2783 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2784    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2785    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2786    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2787    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

        C1 is the character that followed ESC.

        If C1 is '0' while composition_state is COMPOSING_COMPONENT_RULE,
        the sequence ends the component part of a composition already in
        progress: component_len records how many elements were collected
        so far, and composition_state becomes COMPOSING_CHAR.

        Otherwise a new composition starts.  Any pending composition is
        first flushed with MAYBE_FINISH_COMPOSITION.  If CHARBUF has no
        room for MAX_COMPOSITION_COMPONENTS more elements, jump to
        no_more_source.  The rest of the source is then searched for the
        terminating ESC '1'.  If it is missing, jump to invalid_code when
        CODING_MODE_LAST_BLOCK is set in coding->mode, and to
        no_more_source otherwise.  On success `method' is set from C1,
        composition_state becomes COMPOSING_CHAR for '0' and '2' or
        COMPOSING_COMPONENT_CHAR for the others, and component_idx and
        component_len are reset to 0.

        NOTE(review): when src == src_end the search loop does not run
        and P is not src_end - 1, so the sequence is accepted although no
        terminator was seen -- confirm whether that case can occur.
2788   */
2789
2790 #define DECODE_COMPOSITION_START(c1)                                    \
2791   do {                                                                  \
2792     if (c1 == '0'                                                       \
2793         && composition_state == COMPOSING_COMPONENT_RULE)               \
2794       {                                                                 \
2795         component_len = component_idx;                                  \
2796         composition_state = COMPOSING_CHAR;                             \
2797       }                                                                 \
2798     else                                                                \
2799       {                                                                 \
2800         const unsigned char *p;                                         \
2801                                                                         \
2802         MAYBE_FINISH_COMPOSITION ();                                    \
2803         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2804           goto no_more_source;                                          \
2805         for (p = src; p < src_end - 1; p++)                             \
2806           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2807             break;                                                      \
2808         if (p == src_end - 1)                                           \
2809           {                                                             \
2810             if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
2811               goto invalid_code;                                        \
2812             goto no_more_source;                                        \
2813           }                                                             \
2814                                                                         \
2815         /* This is surely the start of a composition.  */               \
2816         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2817                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2818                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2819                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2820         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2821                              : COMPOSING_COMPONENT_CHAR);               \
2822         component_idx = component_len = 0;                              \
2823       }                                                                 \
2824   } while (0)
2825
2826
2827 /* Handle composition end sequence ESC 1.

        The number of composed characters, NCHARS, is
        component_idx - component_len when component_len is positive,
        component_idx when `method' is COMPOSITION_RELATIVE, and
        (component_idx + 1) / 2 otherwise.  ADD_COMPOSITION_DATA is
        called with CHARBUF, NCHARS and `method' (presumably it writes an
        annotation header and advances CHARBUF; it is defined
        elsewhere -- confirm).

        Unless the method is COMPOSITION_RELATIVE, the component elements
        follow: the first component_len elements of `components', or all
        component_idx of them when component_len is 0.  The element at
        the position where CHARBUF pointed on entry is then overwritten
        with that position minus the current CHARBUF, a negative count.

        Finally the composed characters are stored in CHARBUF: the
        elements of `components' at even indices for
        COMPOSITION_WITH_RULE, otherwise the elements from component_len
        up to component_idx.  char_offset is incremented once per stored
        character, coding->annotated is set to 1, and composition_state
        is reset to COMPOSING_NO.  */
2828
2829 #define DECODE_COMPOSITION_END()                                        \
2830   do {                                                                  \
2831     int nchars = (component_len > 0 ? component_idx - component_len     \
2832                   : method == COMPOSITION_RELATIVE ? component_idx      \
2833                   : (component_idx + 1) / 2);                           \
2834     int i;                                                              \
2835     int *saved_charbuf = charbuf;                                       \
2836                                                                         \
2837     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
2838     if (method != COMPOSITION_RELATIVE)                                 \
2839       {                                                                 \
2840         if (component_len == 0)                                         \
2841           for (i = 0; i < component_idx; i++)                           \
2842             *charbuf++ = components[i];                                 \
2843         else                                                            \
2844           for (i = 0; i < component_len; i++)                           \
2845             *charbuf++ = components[i];                                 \
2846         *saved_charbuf = saved_charbuf - charbuf;                       \
2847       }                                                                 \
2848     if (method == COMPOSITION_WITH_RULE)                                \
2849       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2850         *charbuf++ = components[i];                                     \
2851     else                                                                \
2852       for (i = component_len; i < component_idx; i++, char_offset++)    \
2853         *charbuf++ = components[i];                                     \
2854     coding->annotated = 1;                                              \
2855     composition_state = COMPOSING_NO;                                   \
2856   } while (0)
2857
2858
2859 /* Decode a composition rule from the byte C1 (and maybe one more byte
2860    from SRC) and leave the encoded composition rule in C1, which must
2861    be an lvalue.

        32 is first subtracted from C1.  A result below 81 is the old
        format, a single byte holding two reference points as
        GREF * 9 + NREF, where a reference point value of 4 is replaced
        by 10 before encoding.  A result from 81 up to 92 is the new
        format: one more byte is read into C2 with ONE_MORE_BYTE, and the
        rule is encoded from C1 - 81 and C2 - 32.  Any other result makes
        C1 0.

        The macro uses the variable `c2' of the expansion site.  */
2862
2863 #define DECODE_COMPOSITION_RULE(c1)                                     \
2864   do {                                                                  \
2865     (c1) -= 32;                                                         \
2866     if (c1 < 81)                /* old format (before ver.21) */        \
2867       {                                                                 \
2868         int gref = (c1) / 9;                                            \
2869         int nref = (c1) % 9;                                            \
2870         if (gref == 4) gref = 10;                                       \
2871         if (nref == 4) nref = 10;                                       \
2872         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2873       }                                                                 \
2874     else if (c1 < 93)           /* new format (after ver.21) */         \
2875       {                                                                 \
2876         ONE_MORE_BYTE (c2);                                             \
2877         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2878       }                                                                 \
2879     else                                                                \
2880       c1 = 0;                                                           \
2881   } while (0)
2882
2883
2884 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2885
2886 static void
2887 decode_coding_iso_2022 (coding)
2888      struct coding_system *coding;
2889 {
2890   const unsigned char *src = coding->source + coding->consumed;
2891   const unsigned char *src_end = coding->source + coding->src_bytes;
2892   const unsigned char *src_base;
2893   int *charbuf = coding->charbuf + coding->charbuf_used;
2894   int *charbuf_end
2895     = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2896   int consumed_chars = 0, consumed_chars_base;
2897   int multibytep = coding->src_multibyte;
2898   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2899   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2900   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2901   int charset_id_2, charset_id_3;
2902   struct charset *charset;
2903   int c;
2904   /* For handling composition sequence.  */
2905 #define COMPOSING_NO                    0
2906 #define COMPOSING_CHAR                  1
2907 #define COMPOSING_RULE                  2
2908 #define COMPOSING_COMPONENT_CHAR        3
2909 #define COMPOSING_COMPONENT_RULE        4
2910
2911   int composition_state = COMPOSING_NO;
2912   enum composition_method method;
2913   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2914   int component_idx;
2915   int component_len;
2916   Lisp_Object attrs, charset_list;
2917   int char_offset = coding->produced_char;
2918   int last_offset = char_offset;
2919   int last_id = charset_ascii;
2920
2921   CODING_GET_INFO (coding, attrs, charset_list);
2922   setup_iso_safe_charsets (attrs);
2923
2924   while (1)
2925     {
2926       int c1, c2;
2927
2928       src_base = src;
2929       consumed_chars_base = consumed_chars;
2930
2931       if (charbuf >= charbuf_end)
2932         break;
2933
2934       ONE_MORE_BYTE (c1);
2935       if (c1 < 0)
2936         goto invalid_code;
2937
2938       /* We produce at most one character.  */
2939       switch (iso_code_class [c1])
2940         {
2941         case ISO_0x20_or_0x7F:
2942           if (composition_state != COMPOSING_NO)
2943             {
2944               if (composition_state == COMPOSING_RULE
2945                   || composition_state == COMPOSING_COMPONENT_RULE)
2946                 {
2947                   DECODE_COMPOSITION_RULE (c1);
2948                   components[component_idx++] = c1;
2949                   composition_state--;
2950                   continue;
2951                 }
2952             }
2953           if (charset_id_0 < 0
2954               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2955             /* This is SPACE or DEL.  */
2956             charset = CHARSET_FROM_ID (charset_ascii);
2957           else
2958             charset = CHARSET_FROM_ID (charset_id_0);
2959           break;
2960
2961         case ISO_graphic_plane_0:
2962           if (composition_state != COMPOSING_NO)
2963             {
2964               if (composition_state == COMPOSING_RULE
2965                   || composition_state == COMPOSING_COMPONENT_RULE)
2966                 {
2967                   DECODE_COMPOSITION_RULE (c1);
2968                   components[component_idx++] = c1;
2969                   composition_state--;
2970                   continue;
2971                 }
2972             }
2973           if (charset_id_0 < 0)
2974             charset = CHARSET_FROM_ID (charset_ascii);
2975           else
2976             charset = CHARSET_FROM_ID (charset_id_0);
2977           break;
2978
2979         case ISO_0xA0_or_0xFF:
2980           if (charset_id_1 < 0
2981               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2982               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2983             goto invalid_code;
2984           /* This is a graphic character, we fall down ... */
2985
2986         case ISO_graphic_plane_1:
2987           if (charset_id_1 < 0)
2988             goto invalid_code;
2989           charset = CHARSET_FROM_ID (charset_id_1);
2990           break;
2991
2992         case ISO_control_0:
2993           MAYBE_FINISH_COMPOSITION ();
2994           charset = CHARSET_FROM_ID (charset_ascii);
2995           break;
2996
2997         case ISO_control_1:
2998           MAYBE_FINISH_COMPOSITION ();
2999           goto invalid_code;
3000
3001         case ISO_shift_out:
3002           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3003               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3004             goto invalid_code;
3005           CODING_ISO_INVOCATION (coding, 0) = 1;
3006           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3007           continue;
3008
3009         case ISO_shift_in:
3010           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3011             goto invalid_code;
3012           CODING_ISO_INVOCATION (coding, 0) = 0;
3013           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3014           continue;
3015
3016         case ISO_single_shift_2_7:
3017         case ISO_single_shift_2:
3018           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3019             goto invalid_code;
3020           /* SS2 is handled as an escape sequence of ESC 'N' */
3021           c1 = 'N';
3022           goto label_escape_sequence;
3023
3024         case ISO_single_shift_3:
3025           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3026             goto invalid_code;
          /* SS3 is handled as an escape sequence of ESC 'O' */
3028           c1 = 'O';
3029           goto label_escape_sequence;
3030
3031         case ISO_control_sequence_introducer:
3032           /* CSI is handled as an escape sequence of ESC '[' ...  */
3033           c1 = '[';
3034           goto label_escape_sequence;
3035
3036         case ISO_escape:
3037           ONE_MORE_BYTE (c1);
3038         label_escape_sequence:
3039           /* Escape sequences handled here are invocation,
3040              designation, direction specification, and character
3041              composition specification.  */
3042           switch (c1)
3043             {
3044             case '&':           /* revision of following character set */
3045               ONE_MORE_BYTE (c1);
3046               if (!(c1 >= '@' && c1 <= '~'))
3047                 goto invalid_code;
3048               ONE_MORE_BYTE (c1);
3049               if (c1 != ISO_CODE_ESC)
3050                 goto invalid_code;
3051               ONE_MORE_BYTE (c1);
3052               goto label_escape_sequence;
3053
3054             case '$':           /* designation of 2-byte character set */
3055               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3056                 goto invalid_code;
3057               {
3058                 int reg, chars96;
3059
3060                 ONE_MORE_BYTE (c1);
3061                 if (c1 >= '@' && c1 <= 'B')
3062                   {     /* designation of JISX0208.1978, GB2312.1980,
3063                            or JISX0208.1980 */
3064                     reg = 0, chars96 = 0;
3065                   }
3066                 else if (c1 >= 0x28 && c1 <= 0x2B)
3067                   { /* designation of DIMENSION2_CHARS94 character set */
3068                     reg = c1 - 0x28, chars96 = 0;
3069                     ONE_MORE_BYTE (c1);
3070                   }
3071                 else if (c1 >= 0x2C && c1 <= 0x2F)
3072                   { /* designation of DIMENSION2_CHARS96 character set */
3073                     reg = c1 - 0x2C, chars96 = 1;
3074                     ONE_MORE_BYTE (c1);
3075                   }
3076                 else
3077                   goto invalid_code;
3078                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3079                 /* We must update these variables now.  */
3080                 if (reg == 0)
3081                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3082                 else if (reg == 1)
3083                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3084                 if (chars96 < 0)
3085                   goto invalid_code;
3086               }
3087               continue;
3088
3089             case 'n':           /* invocation of locking-shift-2 */
3090               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3091                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3092                 goto invalid_code;
3093               CODING_ISO_INVOCATION (coding, 0) = 2;
3094               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3095               continue;
3096
3097             case 'o':           /* invocation of locking-shift-3 */
3098               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3099                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3100                 goto invalid_code;
3101               CODING_ISO_INVOCATION (coding, 0) = 3;
3102               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3103               continue;
3104
3105             case 'N':           /* invocation of single-shift-2 */
3106               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3107                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3108                 goto invalid_code;
3109               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3110               if (charset_id_2 < 0)
3111                 charset = CHARSET_FROM_ID (charset_ascii);
3112               else
3113                 charset = CHARSET_FROM_ID (charset_id_2);
3114               ONE_MORE_BYTE (c1);
3115               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3116                 goto invalid_code;
3117               break;
3118
3119             case 'O':           /* invocation of single-shift-3 */
3120               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3121                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3122                 goto invalid_code;
3123               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3124               if (charset_id_3 < 0)
3125                 charset = CHARSET_FROM_ID (charset_ascii);
3126               else
3127                 charset = CHARSET_FROM_ID (charset_id_3);
3128               ONE_MORE_BYTE (c1);
3129               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3130                 goto invalid_code;
3131               break;
3132
3133             case '0': case '2': case '3': case '4': /* start composition */
3134               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3135                 goto invalid_code;
3136               DECODE_COMPOSITION_START (c1);
3137               continue;
3138
3139             case '1':           /* end composition */
3140               if (composition_state == COMPOSING_NO)
3141                 goto invalid_code;
3142               DECODE_COMPOSITION_END ();
3143               continue;
3144
3145             case '[':           /* specification of direction */
3146               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3147                 goto invalid_code;
              /* For the moment, nested direction is not supported.
                 So, `coding->mode & CODING_MODE_DIRECTION' being zero
                 means left-to-right, and nonzero means right-to-left.  */
3151               ONE_MORE_BYTE (c1);
3152               switch (c1)
3153                 {
3154                 case ']':       /* end of the current direction */
3155                   coding->mode &= ~CODING_MODE_DIRECTION;
3156
3157                 case '0':       /* end of the current direction */
3158                 case '1':       /* start of left-to-right direction */
3159                   ONE_MORE_BYTE (c1);
3160                   if (c1 == ']')
3161                     coding->mode &= ~CODING_MODE_DIRECTION;
3162                   else
3163                     goto invalid_code;
3164                   break;
3165
3166                 case '2':       /* start of right-to-left direction */
3167                   ONE_MORE_BYTE (c1);
3168                   if (c1 == ']')
3169                     coding->mode |= CODING_MODE_DIRECTION;
3170                   else
3171                     goto invalid_code;
3172                   break;
3173
3174                 default:
3175                   goto invalid_code;
3176                 }
3177               continue;
3178
3179             case '%':
3180               ONE_MORE_BYTE (c1);
3181               if (c1 == '/')
3182                 {
3183                   /* CTEXT extended segment:
3184                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3185                      We keep these bytes as is for the moment.
3186                      They may be decoded by post-read-conversion.  */
3187                   int dim, M, L;
3188                   int size;
3189
3190                   ONE_MORE_BYTE (dim);
3191                   ONE_MORE_BYTE (M);
3192                   ONE_MORE_BYTE (L);
3193                   size = ((M - 128) * 128) + (L - 128);
3194                   if (charbuf + 8 + size > charbuf_end)
3195                     goto break_loop;
3196                   *charbuf++ = ISO_CODE_ESC;
3197                   *charbuf++ = '%';
3198                   *charbuf++ = '/';
3199                   *charbuf++ = dim;
3200                   *charbuf++ = BYTE8_TO_CHAR (M);
3201                   *charbuf++ = BYTE8_TO_CHAR (L);
3202                   while (size-- > 0)
3203                     {
3204                       ONE_MORE_BYTE (c1);
3205                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3206                     }
3207                 }
3208               else if (c1 == 'G')
3209                 {
3210                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3211                      ESC % G --UTF-8-BYTES-- ESC % @
3212                      We keep these bytes as is for the moment.
3213                      They may be decoded by post-read-conversion.  */
3214                   int *p = charbuf;
3215
3216                   if (p + 6 > charbuf_end)
3217                     goto break_loop;
3218                   *p++ = ISO_CODE_ESC;
3219                   *p++ = '%';
3220                   *p++ = 'G';
3221                   while (p < charbuf_end)
3222                     {
3223                       ONE_MORE_BYTE (c1);
3224                       if (c1 == ISO_CODE_ESC
3225                           && src + 1 < src_end
3226                           && src[0] == '%'
3227                           && src[1] == '@')
3228                         {
3229                           src += 2;
3230                           break;
3231                         }
3232                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3233                     }
3234                   if (p + 3 > charbuf_end)
3235                     goto break_loop;
3236                   *p++ = ISO_CODE_ESC;
3237                   *p++ = '%';
3238                   *p++ = '@';
3239                   charbuf = p;
3240                 }
3241               else
3242                 goto invalid_code;
3243               continue;
3244               break;
3245
3246             default:
3247               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3248                 goto invalid_code;
3249               {
3250                 int reg, chars96;
3251
3252                 if (c1 >= 0x28 && c1 <= 0x2B)
3253                   { /* designation of DIMENSION1_CHARS94 character set */
3254                     reg = c1 - 0x28, chars96 = 0;
3255                     ONE_MORE_BYTE (c1);
3256                   }
3257                 else if (c1 >= 0x2C && c1 <= 0x2F)
3258                   { /* designation of DIMENSION1_CHARS96 character set */
3259                     reg = c1 - 0x2C, chars96 = 1;
3260                     ONE_MORE_BYTE (c1);
3261                   }
3262                 else
3263                   goto invalid_code;
3264                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3265                 /* We must update these variables now.  */
3266                 if (reg == 0)
3267                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3268                 else if (reg == 1)
3269                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3270                 if (chars96 < 0)
3271                   goto invalid_code;
3272               }
3273               continue;
3274             }
3275         }
3276
3277       if (charset->id != charset_ascii
3278           && last_id != charset->id)
3279         {
3280           if (last_id != charset_ascii)
3281             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3282           last_id = charset->id;
3283           last_offset = char_offset;
3284         }
3285
3286       /* Now we know CHARSET and 1st position code C1 of a character.
3287          Produce a decoded character while getting 2nd position code
3288          C2 if necessary.  */
3289       c1 &= 0x7F;
3290       if (CHARSET_DIMENSION (charset) > 1)
3291         {
3292           ONE_MORE_BYTE (c2);
3293           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3294             /* C2 is not in a valid range.  */
3295             goto invalid_code;
3296           c1 = (c1 << 8) | (c2 & 0x7F);
3297           if (CHARSET_DIMENSION (charset) > 2)
3298             {
3299               ONE_MORE_BYTE (c2);
3300               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3301                 /* C2 is not in a valid range.  */
3302                 goto invalid_code;
3303               c1 = (c1 << 8) | (c2 & 0x7F);
3304             }
3305         }
3306
3307       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3308       if (c < 0)
3309         {
3310           MAYBE_FINISH_COMPOSITION ();
3311           for (; src_base < src; src_base++, char_offset++)
3312             {
3313               if (ASCII_BYTE_P (*src_base))
3314                 *charbuf++ = *src_base;
3315               else
3316                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3317             }
3318         }
3319       else if (composition_state == COMPOSING_NO)
3320         {
3321           *charbuf++ = c;
3322           char_offset++;
3323         }
3324       else
3325         {
3326           components[component_idx++] = c;
3327           if (method == COMPOSITION_WITH_RULE
3328               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3329                   && composition_state == COMPOSING_COMPONENT_CHAR))
3330             composition_state++;
3331         }
3332       continue;
3333
3334     invalid_code:
3335       MAYBE_FINISH_COMPOSITION ();
3336       src = src_base;
3337       consumed_chars = consumed_chars_base;
3338       ONE_MORE_BYTE (c);
3339       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3340       char_offset++;
3341       coding->errors++;
3342       continue;
3343
3344     break_loop:
3345       break;
3346     }
3347
3348  no_more_source:
3349   if (last_id != charset_ascii)
3350     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3351   coding->consumed_char += consumed_chars_base;
3352   coding->consumed = src_base - coding->source;
3353   coding->charbuf_used = charbuf - coding->charbuf;
3354 }
3355
3356
3357 /* ISO2022 encoding stuff.  */
3358
/*
   Saying just "ISO2022" is not enough to specify an encoding; more
   details are needed.  In Emacs, each coding system of an ISO2022
   variant has the following specifications:
        1. Initial designation to G0 thru G3.
        2. Allows short-form designation?
        3. ASCII should be designated to G0 before control characters?
        4. ASCII should be designated to G0 at end of line?
        5. 7-bit environment or 8-bit environment?
        6. Use locking-shift?
        7. Use single-shift?
   And the following two are only for Japanese:
        8. Use ASCII in place of JIS0201-1976-Roman?
        9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in CODING_ISO_FLAGS (coding) as flag
   bits defined by the macros CODING_ISO_FLAG_XXX.  See `coding.h' for
   more details.
*/
3377
/* Emit the escape sequence that designates CHARSET to graphic
   register REG, then record that designation in CODING.

   The sequence is: an optional revision prefix (ESC '&' '@'+revision,
   only when CODING has CODING_ISO_FLAG_REVISION and CHARSET has a
   non-negative revision), then ESC, then '$' for a charset whose
   dimension is not 1, then an intermediate character selected by REG
   (one of "()*+" for a 94-charset, one of ",-./" for a 96-charset),
   then the <final-char> of CHARSET.

   Short form: for a multi-dimension 94-charset going to register 0
   whose <final-char> is '@', 'A', or 'B', the intermediate character
   is omitted unless CODING has CODING_ISO_FLAG_LONG_FORM.

   Output goes through the EMIT_* macros, so whatever variables those
   macros use must be in scope at the point of expansion.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    unsigned char desig_final = CHARSET_ISO_FINAL (charset);            \
    int desig_revision                                                  \
      = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)         \
         ? CHARSET_ISO_REVISION (charset)                               \
         : -1);                                                         \
                                                                        \
    /* Optional announcement of the charset revision.  */               \
    if (desig_revision >= 0)                                            \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + desig_revision);                           \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      EMIT_ONE_ASCII_BYTE ('$');                                        \
                                                                        \
    /* Intermediate character; skipped only for the short form.  */     \
    if (CHARSET_ISO_CHARS_96 (charset))                                 \
      EMIT_ONE_ASCII_BYTE (",-./"[reg]);                                \
    else if (CHARSET_DIMENSION (charset) == 1                           \
             || (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM) \
             || reg != 0                                                \
             || desig_final < '@' || desig_final > 'B')                 \
      EMIT_ONE_ASCII_BYTE ("()*+"[reg]);                                \
                                                                        \
    EMIT_ONE_ASCII_BYTE (desig_final);                                  \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3425
3426
3427 /* The following two macros produce codes (control character or escape
3428    sequence) for ISO2022 single-shift functions (single-shift-2 and
3429    single-shift-3).  */
3430
/* Emit the single-shift-2 function and mark CODING as being in the
   single-shifting state.  An 8-bit coding system gets the one-byte
   control ISO_CODE_SS2; one with CODING_ISO_FLAG_SEVEN_BITS gets the
   escape sequence ESC 'N' instead.  Uses `coding' from the enclosing
   scope.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3439
3440
/* Emit the single-shift-3 function and mark CODING as being in the
   single-shifting state.  An 8-bit coding system gets the one-byte
   control ISO_CODE_SS3; one with CODING_ISO_FLAG_SEVEN_BITS gets the
   escape sequence ESC 'O' instead.  Uses `coding' from the enclosing
   scope.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3449
3450
/* The following four macros produce codes (control character or
   escape sequence) for ISO2022 locking-shift functions (shift-in,
   shift-out, locking-shift-2, and locking-shift-3).  Each one emits
   the code and then records in CODING which graphic register is now
   invoked to graphic plane 0.  They use `coding' from the enclosing
   scope.

     macro                    code emitted    register invoked to plane 0
     ENCODE_SHIFT_IN          ISO_CODE_SI     0
     ENCODE_SHIFT_OUT         ISO_CODE_SO     1
     ENCODE_LOCKING_SHIFT_2   ESC 'n'         2
     ENCODE_LOCKING_SHIFT_3   ESC 'o'         3  */

#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
  } while (0)


#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
  } while (0)


#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
  } while (0)


/* The final byte must be 'o': ESC 'n' is locking-shift-2, and the
   decoder treats ESC 'o' as the invocation of locking-shift-3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3481
3482
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be a modifiable lvalue: when the coding system has
   CODING_ISO_FLAG_USE_ROMAN and CHARSET is ASCII, it is replaced by
   the JISX0201-Roman charset.  C1 may be any integer expression; it
   is parenthesized at every use.

   The macro refers to `coding', `dst' and `produced_chars' in the
   enclosing scope.  The body loops until one byte has been emitted:
     - right after a single-shift, emit C1 (with the high bit cleared
       in a 7-bit coding system, set otherwise) and clear the
       single-shifting state;
     - if CHARSET is invoked to graphic plane 0, emit C1 with the high
       bit cleared;
     - if CHARSET is invoked to graphic plane 1, emit C1 with the high
       bit set;
     - otherwise call encode_invocation_designation and try again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3525
3526
/* Emit the two bytes of a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, first producing any designation or
   invocation codes that are still needed.

   CHARSET must be a modifiable lvalue: when the coding system has
   CODING_ISO_FLAG_USE_OLDJIS and CHARSET is charset_jisx0208, it is
   replaced by charset_jisx0208_1978.  The macro refers to `coding',
   `dst' and `produced_chars' in the enclosing scope.

   Each pass of the loop either emits the character and leaves, or
   asks encode_invocation_designation to make CHARSET available and
   tries again.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int iso_id = CHARSET_ID (charset);                                  \
                                                                        \
    if (iso_id == charset_jisx0208                                      \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        iso_id = charset_jisx0208_1978;                                 \
        charset = CHARSET_FROM_ID (iso_id);                             \
      }                                                                 \
                                                                        \
    /* Right after a single-shift: the coding system's bit width       \
       decides the form, and the shift is used up.  */                  \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)) \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        else                                                            \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Invoked to graphic plane 0: high bits cleared.  */               \
    if (iso_id == CODING_ISO_INVOKED_CHARSET (coding, 0))               \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Invoked to graphic plane 1: high bits set.  */                   \
    if (iso_id == CODING_ISO_INVOKED_CHARSET (coding, 1))               \
      {                                                                 \
        EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
        break;                                                          \
      }                                                                 \
                                                                        \
    /* Not invoked to either plane yet: produce the designation        \
       and/or invocation codes, then go around again to emit the       \
       character itself.  */                                            \
    dst = encode_invocation_designation (charset, coding, dst,          \
                                         &produced_chars);              \
  } while (1)
3569
3570
/* Encode character C of CHARSET.  The code point obtained from
   ENCODE_CHAR is handed to ENCODE_ISO_CHARACTER_DIMENSION1 when the
   dimension of CHARSET is 1; otherwise it is split into a high part
   (code >> 8) and a low byte (code & 0xFF) for
   ENCODE_ISO_CHARACTER_DIMENSION2.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int iso_code = ENCODE_CHAR ((charset),(c));                            \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset),                          \
                                       iso_code >> 8, iso_code & 0xFF);    \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), iso_code);               \
  } while (0)
3580
3581
3582 /* Produce designation and invocation codes at a place pointed by DST
3583    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3584    Return new DST.  */
3585
3586 unsigned char *
3587 encode_invocation_designation (charset, coding, dst, p_nchars)
3588      struct charset *charset;
3589      struct coding_system *coding;
3590      unsigned char *dst;
3591      int *p_nchars;
3592 {
3593   int multibytep = coding->dst_multibyte;
3594   int produced_chars = *p_nchars;
3595   int reg;                      /* graphic register number */
3596   int id = CHARSET_ID (charset);
3597
3598   /* At first, check designations.  */
3599   for (reg = 0; reg < 4; reg++)
3600     if (id == CODING_ISO_DESIGNATION (coding, reg))
3601       break;
3602
3603   if (reg >= 4)
3604     {
3605       /* CHARSET is not yet designated to any graphic registers.  */
3606       /* At first check the requested designation.  */
3607       reg = CODING_ISO_REQUEST (coding, id);
3608       if (reg < 0)
3609         /* Since CHARSET requests no special designation, designate it
3610            to graphic register 0.  */
3611         reg = 0;
3612
3613       ENCODE_DESIGNATION (charset, reg, coding);
3614     }
3615
3616   if (CODING_ISO_INVOCATION (coding, 0) != reg
3617       && CODING_ISO_INVOCATION (coding, 1) != reg)
3618     {
3619       /* Since the graphic register REG is not invoked to any graphic
3620          planes, invoke it to graphic plane 0.  */
3621       switch (reg)
3622         {
3623         case 0:                 /* graphic register 0 */
3624           ENCODE_SHIFT_IN;
3625           break;
3626
3627         case 1:                 /* graphic register 1 */
3628           ENCODE_SHIFT_OUT;
3629           break;
3630
3631         case 2:                 /* graphic register 2 */
3632           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3633             ENCODE_SINGLE_SHIFT_2;
3634           else
3635             ENCODE_LOCKING_SHIFT_2;
3636           break;
3637
3638         case 3:                 /* graphic register 3 */
3639           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3640             ENCODE_SINGLE_SHIFT_3;
3641           else
3642             ENCODE_LOCKING_SHIFT_3;
3643           break;
3644         }
3645     }
3646
3647   *p_nchars = produced_chars;
3648   return dst;
3649 }
3650
3651 /* The following three macros produce codes for indicating direction
3652    of text.  */
/* Emit a control sequence introducer: the two bytes ESC '[' when the
   coding system has CODING_ISO_FLAG_SEVEN_BITS set, the single 8-bit
   control ISO_CODE_CSI otherwise.  The flag is tested with `&', as
   everywhere else in this file, because the flag word normally has
   other bits set as well.  Uses `coding' from the enclosing scope.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
3660
3661
/* Emit the start of right-to-left text: a control sequence introducer
   followed by '2' ']'.  ENCODE_CONTROL_SEQUENCE_INTRODUCER is an
   object-like macro, so it is used without an argument list.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
3667
3668
/* Emit the return to left-to-right text: a control sequence
   introducer followed by '0' ']'.  ENCODE_CONTROL_SEQUENCE_INTRODUCER
   is an object-like macro, so it is used without an argument list.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
3674
3675
/* Bring the graphic planes and registers back to their initial
   state: shift in when something other than register 0 is invoked
   to plane 0, then re-designate every register whose current
   designation differs from its (non-negative) initial charset.
   Refers to the variable `coding' of the expanding function.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int reg;                                                            \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    for (reg = 0; reg < 4; reg++)                                       \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, reg);              \
        struct charset *charset;                                        \
                                                                        \
        /* Nothing to do for an unset or already-current register.  */  \
        if (initial_id < 0                                              \
            || CODING_ISO_DESIGNATION (coding, reg) == initial_id)      \
          continue;                                                     \
        charset = CHARSET_FROM_ID (initial_id);                         \
        ENCODE_DESIGNATION (charset, reg, coding);                      \
      }                                                                 \
  } while (0)
3694
3695
3696 /* Produce designation sequences of charsets in the line started from
3697    SRC to a place pointed by DST, and return updated DST.
3698
3699    If the current block ends before any end-of-line, we may fail to
3700    find all the necessary designations.  */
3701
3702 static unsigned char *
3703 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3704      struct coding_system *coding;
3705      int *charbuf, *charbuf_end;
3706      unsigned char *dst;
3707 {
3708   struct charset *charset;
3709   /* Table of charsets to be designated to each graphic register.  */
3710   int r[4];
3711   int c, found = 0, reg;
3712   int produced_chars = 0;
3713   int multibytep = coding->dst_multibyte;
3714   Lisp_Object attrs;
3715   Lisp_Object charset_list;
3716
3717   attrs = CODING_ID_ATTRS (coding->id);
3718   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3719   if (EQ (charset_list, Qiso_2022))
3720     charset_list = Viso_2022_charset_list;
3721
3722   for (reg = 0; reg < 4; reg++)
3723     r[reg] = -1;
3724
3725   while (found < 4)
3726     {
3727       int id;
3728
3729       c = *charbuf++;
3730       if (c == '\n')
3731         break;
3732       charset = char_charset (c, charset_list, NULL);
3733       id = CHARSET_ID (charset);
3734       reg = CODING_ISO_REQUEST (coding, id);
3735       if (reg >= 0 && r[reg] < 0)
3736         {
3737           found++;
3738           r[reg] = id;
3739         }
3740     }
3741
3742   if (found)
3743     {
3744       for (reg = 0; reg < 4; reg++)
3745         if (r[reg] >= 0
3746             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3747           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3748     }
3749
3750   return dst;
3751 }
3752
3753 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3754
3755 static int
3756 encode_coding_iso_2022 (coding)
3757      struct coding_system *coding;
3758 {
3759   int multibytep = coding->dst_multibyte;
3760   int *charbuf = coding->charbuf;
3761   int *charbuf_end = charbuf + coding->charbuf_used;
3762   unsigned char *dst = coding->destination + coding->produced;
3763   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3764   int safe_room = 16;
3765   int bol_designation
3766     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3767        && CODING_ISO_BOL (coding));
3768   int produced_chars = 0;
3769   Lisp_Object attrs, eol_type, charset_list;
3770   int ascii_compatible;
3771   int c;
3772   int preferred_charset_id = -1;
3773
3774   CODING_GET_INFO (coding, attrs, charset_list);
3775   eol_type = CODING_ID_EOL_TYPE (coding->id);
3776   if (VECTORP (eol_type))
3777     eol_type = Qunix;
3778
3779   setup_iso_safe_charsets (attrs);
3780   /* Charset list may have been changed.  */
3781   charset_list = CODING_ATTR_CHARSET_LIST (attrs);              \
3782   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3783
3784   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3785
3786   while (charbuf < charbuf_end)
3787     {
3788       ASSURE_DESTINATION (safe_room);
3789
3790       if (bol_designation)
3791         {
3792           unsigned char *dst_prev = dst;
3793
3794           /* We have to produce designation sequences if any now.  */
3795           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3796           bol_designation = 0;
3797           /* We are sure that designation sequences are all ASCII bytes.  */
3798           produced_chars += dst - dst_prev;
3799         }
3800
3801       c = *charbuf++;
3802
3803       if (c < 0)
3804         {
3805           /* Handle an annotation.  */
3806           switch (*charbuf)
3807             {
3808             case CODING_ANNOTATE_COMPOSITION_MASK:
3809               /* Not yet implemented.  */
3810               break;
3811             case CODING_ANNOTATE_CHARSET_MASK:
3812               preferred_charset_id = charbuf[2];
3813               if (preferred_charset_id >= 0
3814                   && NILP (Fmemq (make_number (preferred_charset_id),
3815                                   charset_list)))
3816                 preferred_charset_id = -1;
3817               break;
3818             default:
3819               abort ();
3820             }
3821           charbuf += -c - 1;
3822           continue;
3823         }
3824
3825       /* Now encode the character C.  */
3826       if (c < 0x20 || c == 0x7F)
3827         {
3828           if (c == '\n'
3829               || (c == '\r' && EQ (eol_type, Qmac)))
3830             {
3831               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3832                 ENCODE_RESET_PLANE_AND_REGISTER ();
3833               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3834                 {
3835                   int i;
3836
3837                   for (i = 0; i < 4; i++)
3838                     CODING_ISO_DESIGNATION (coding, i)
3839                       = CODING_ISO_INITIAL (coding, i);
3840                 }
3841               bol_designation
3842                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3843             }
3844           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3845             ENCODE_RESET_PLANE_AND_REGISTER ();
3846           EMIT_ONE_ASCII_BYTE (c);
3847         }
3848       else if (ASCII_CHAR_P (c))
3849         {
3850           if (ascii_compatible)
3851             EMIT_ONE_ASCII_BYTE (c);
3852           else
3853             {
3854               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3855               ENCODE_ISO_CHARACTER (charset, c);
3856             }
3857         }
3858       else if (CHAR_BYTE8_P (c))
3859         {
3860           c = CHAR_TO_BYTE8 (c);
3861           EMIT_ONE_BYTE (c);
3862         }
3863       else
3864         {
3865           struct charset *charset;
3866
3867           if (preferred_charset_id >= 0)
3868             {
3869               charset = CHARSET_FROM_ID (preferred_charset_id);
3870               if (! CHAR_CHARSET_P (c, charset))
3871                 charset = char_charset (c, charset_list, NULL);
3872             }
3873           else
3874             charset = char_charset (c, charset_list, NULL);
3875           if (!charset)
3876             {
3877               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3878                 {
3879                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3880                   charset = CHARSET_FROM_ID (charset_ascii);
3881                 }
3882               else
3883                 {
3884                   c = coding->default_char;
3885                   charset = char_charset (c, charset_list, NULL);
3886                 }
3887             }
3888           ENCODE_ISO_CHARACTER (charset, c);
3889         }
3890     }
3891
3892   if (coding->mode & CODING_MODE_LAST_BLOCK
3893       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3894     {
3895       ASSURE_DESTINATION (safe_room);
3896       ENCODE_RESET_PLANE_AND_REGISTER ();
3897     }
3898   record_conversion_result (coding, CODING_RESULT_SUCCESS);
3899   CODING_ISO_BOL (coding) = bol_designation;
3900   coding->produced_char += produced_chars;
3901   coding->produced = dst - coding->destination;
3902   return 0;
3903 }
3904
3905 \f
3906 /*** 8,9. SJIS and BIG5 handlers ***/
3907
3908 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3909    quite widely.  So, for the moment, Emacs supports them in the bare
3910    C code.  But, in the future, they may be supported only by CCL.  */
3911
/* SJIS is a coding system encoding three character sets: ASCII, right
   half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
   as is.  A character of charset katakana-jisx0201 is encoded by
   "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes, with its two position-codes divided and
   shifted so that they fit in the ranges below.
3918
3919    --- CODE RANGE of SJIS ---
3920    (character set)      (range)
3921    ASCII                0x00 .. 0x7F
3922    KATAKANA-JISX0201    0xA0 .. 0xDF
3923    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3924             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3925    -------------------------------
3926
3927 */
3928
/* BIG5 is a coding system encoding two character sets: ASCII and
   Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
   character set and each of its characters is encoded in two bytes.
3932
3933    --- CODE RANGE of BIG5 ---
3934    (character set)      (range)
3935    ASCII                0x00 .. 0x7F
3936    Big5 (1st byte)      0xA1 .. 0xFE
3937         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3938    --------------------------
3939
3940   */
3941
3942 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3943    Check if a text is encoded in SJIS.  If it is, return
3944    CATEGORY_MASK_SJIS, else return 0.  */
3945
3946 static int
3947 detect_coding_sjis (coding, detect_info)
3948      struct coding_system *coding;
3949      struct coding_detection_info *detect_info;
3950 {
3951   const unsigned char *src = coding->source, *src_base;
3952   const unsigned char *src_end = coding->source + coding->src_bytes;
3953   int multibytep = coding->src_multibyte;
3954   int consumed_chars = 0;
3955   int found = 0;
3956   int c;
3957
3958   detect_info->checked |= CATEGORY_MASK_SJIS;
3959   /* A coding system of this category is always ASCII compatible.  */
3960   src += coding->head_ascii;
3961
3962   while (1)
3963     {
3964       src_base = src;
3965       ONE_MORE_BYTE (c);
3966       if (c < 0x80)
3967         continue;
3968       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3969         {
3970           ONE_MORE_BYTE (c);
3971           if (c < 0x40 || c == 0x7F || c > 0xFC)
3972             break;
3973           found = CATEGORY_MASK_SJIS;
3974         }
3975       else if (c >= 0xA0 && c < 0xE0)
3976         found = CATEGORY_MASK_SJIS;
3977       else
3978         break;
3979     }
3980   detect_info->rejected |= CATEGORY_MASK_SJIS;
3981   return 0;
3982
3983  no_more_source:
3984   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3985     {
3986       detect_info->rejected |= CATEGORY_MASK_SJIS;
3987       return 0;
3988     }
3989   detect_info->found |= found;
3990   return 1;
3991 }
3992
3993 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3994    Check if a text is encoded in BIG5.  If it is, return
3995    CATEGORY_MASK_BIG5, else return 0.  */
3996
3997 static int
3998 detect_coding_big5 (coding, detect_info)
3999      struct coding_system *coding;
4000      struct coding_detection_info *detect_info;
4001 {
4002   const unsigned char *src = coding->source, *src_base;
4003   const unsigned char *src_end = coding->source + coding->src_bytes;
4004   int multibytep = coding->src_multibyte;
4005   int consumed_chars = 0;
4006   int found = 0;
4007   int c;
4008
4009   detect_info->checked |= CATEGORY_MASK_BIG5;
4010   /* A coding system of this category is always ASCII compatible.  */
4011   src += coding->head_ascii;
4012
4013   while (1)
4014     {
4015       src_base = src;
4016       ONE_MORE_BYTE (c);
4017       if (c < 0x80)
4018         continue;
4019       if (c >= 0xA1)
4020         {
4021           ONE_MORE_BYTE (c);
4022           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4023             return 0;
4024           found = CATEGORY_MASK_BIG5;
4025         }
4026       else
4027         break;
4028     }
4029   detect_info->rejected |= CATEGORY_MASK_BIG5;
4030   return 0;
4031
4032  no_more_source:
4033   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4034     {
4035       detect_info->rejected |= CATEGORY_MASK_BIG5;
4036       return 0;
4037     }
4038   detect_info->found |= found;
4039   return 1;
4040 }
4041
4042 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4043    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4044
4045 static void
4046 decode_coding_sjis (coding)
4047      struct coding_system *coding;
4048 {
4049   const unsigned char *src = coding->source + coding->consumed;
4050   const unsigned char *src_end = coding->source + coding->src_bytes;
4051   const unsigned char *src_base;
4052   int *charbuf = coding->charbuf + coding->charbuf_used;
4053   int *charbuf_end
4054     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4055   int consumed_chars = 0, consumed_chars_base;
4056   int multibytep = coding->src_multibyte;
4057   struct charset *charset_roman, *charset_kanji, *charset_kana;
4058   struct charset *charset_kanji2;
4059   Lisp_Object attrs, charset_list, val;
4060   int char_offset = coding->produced_char;
4061   int last_offset = char_offset;
4062   int last_id = charset_ascii;
4063
4064   CODING_GET_INFO (coding, attrs, charset_list);
4065
4066   val = charset_list;
4067   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4068   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4069   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4070   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4071
4072   while (1)
4073     {
4074       int c, c1;
4075       struct charset *charset;
4076
4077       src_base = src;
4078       consumed_chars_base = consumed_chars;
4079
4080       if (charbuf >= charbuf_end)
4081         break;
4082
4083       ONE_MORE_BYTE (c);
4084       if (c < 0)
4085         goto invalid_code;
4086       if (c < 0x80)
4087         charset = charset_roman;
4088       else if (c == 0x80 || c == 0xA0)
4089         goto invalid_code;
4090       else if (c >= 0xA1 && c <= 0xDF)
4091         {
4092           /* SJIS -> JISX0201-Kana */
4093           c &= 0x7F;
4094           charset = charset_kana;
4095         }
4096       else if (c <= 0xEF)
4097         {
4098           /* SJIS -> JISX0208 */
4099           ONE_MORE_BYTE (c1);
4100           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4101             goto invalid_code;
4102           c = (c << 8) | c1;
4103           SJIS_TO_JIS (c);
4104           charset = charset_kanji;
4105         }
4106       else if (c <= 0xFC && charset_kanji2)
4107         {
4108           /* SJIS -> JISX0213-2 */
4109           ONE_MORE_BYTE (c1);
4110           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4111             goto invalid_code;
4112           c = (c << 8) | c1;
4113           SJIS_TO_JIS2 (c);
4114           charset = charset_kanji2;
4115         }
4116       else
4117         goto invalid_code;
4118       if (charset->id != charset_ascii
4119           && last_id != charset->id)
4120         {
4121           if (last_id != charset_ascii)
4122             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4123           last_id = charset->id;
4124           last_offset = char_offset;
4125         }
4126       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4127       *charbuf++ = c;
4128       char_offset++;
4129       continue;
4130
4131     invalid_code:
4132       src = src_base;
4133       consumed_chars = consumed_chars_base;
4134       ONE_MORE_BYTE (c);
4135       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4136       char_offset++;
4137       coding->errors++;
4138     }
4139
4140  no_more_source:
4141   if (last_id != charset_ascii)
4142     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4143   coding->consumed_char += consumed_chars_base;
4144   coding->consumed = src_base - coding->source;
4145   coding->charbuf_used = charbuf - coding->charbuf;
4146 }
4147
4148 static void
4149 decode_coding_big5 (coding)
4150      struct coding_system *coding;
4151 {
4152   const unsigned char *src = coding->source + coding->consumed;
4153   const unsigned char *src_end = coding->source + coding->src_bytes;
4154   const unsigned char *src_base;
4155   int *charbuf = coding->charbuf + coding->charbuf_used;
4156   int *charbuf_end
4157     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4158   int consumed_chars = 0, consumed_chars_base;
4159   int multibytep = coding->src_multibyte;
4160   struct charset *charset_roman, *charset_big5;
4161   Lisp_Object attrs, charset_list, val;
4162   int char_offset = coding->produced_char;
4163   int last_offset = char_offset;
4164   int last_id = charset_ascii;
4165
4166   CODING_GET_INFO (coding, attrs, charset_list);
4167   val = charset_list;
4168   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4169   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4170
4171   while (1)
4172     {
4173       int c, c1;
4174       struct charset *charset;
4175
4176       src_base = src;
4177       consumed_chars_base = consumed_chars;
4178
4179       if (charbuf >= charbuf_end)
4180         break;
4181
4182       ONE_MORE_BYTE (c);
4183
4184       if (c < 0)
4185         goto invalid_code;
4186       if (c < 0x80)
4187         charset = charset_roman;
4188       else
4189         {
4190           /* BIG5 -> Big5 */
4191           if (c < 0xA1 || c > 0xFE)
4192             goto invalid_code;
4193           ONE_MORE_BYTE (c1);
4194           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4195             goto invalid_code;
4196           c = c << 8 | c1;
4197           charset = charset_big5;
4198         }
4199       if (charset->id != charset_ascii
4200           && last_id != charset->id)
4201         {
4202           if (last_id != charset_ascii)
4203             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4204           last_id = charset->id;
4205           last_offset = char_offset;
4206         }
4207       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4208       *charbuf++ = c;
4209       char_offset++;
4210       continue;
4211
4212     invalid_code:
4213       src = src_base;
4214       consumed_chars = consumed_chars_base;
4215       ONE_MORE_BYTE (c);
4216       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4217       char_offset++;
4218       coding->errors++;
4219     }
4220
4221  no_more_source:
4222   if (last_id != charset_ascii)
4223     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4224   coding->consumed_char += consumed_chars_base;
4225   coding->consumed = src_base - coding->source;
4226   coding->charbuf_used = charbuf - coding->charbuf;
4227 }
4228
4229 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4230    This function can encode charsets `ascii', `katakana-jisx0201',
4231    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4232    are sure that all these charsets are registered as official charset
4233    (i.e. do not have extended leading-codes).  Characters of other
4234    charsets are produced without any encoding.  If SJIS_P is 1, encode
4235    SJIS text, else encode BIG5 text.  */
4236
4237 static int
4238 encode_coding_sjis (coding)
4239      struct coding_system *coding;
4240 {
4241   int multibytep = coding->dst_multibyte;
4242   int *charbuf = coding->charbuf;
4243   int *charbuf_end = charbuf + coding->charbuf_used;
4244   unsigned char *dst = coding->destination + coding->produced;
4245   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4246   int safe_room = 4;
4247   int produced_chars = 0;
4248   Lisp_Object attrs, charset_list, val;
4249   int ascii_compatible;
4250   struct charset *charset_roman, *charset_kanji, *charset_kana;
4251   struct charset *charset_kanji2;
4252   int c;
4253
4254   CODING_GET_INFO (coding, attrs, charset_list);
4255   val = charset_list;
4256   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4257   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4258   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4259   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4260
4261   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4262
4263   while (charbuf < charbuf_end)
4264     {
4265       ASSURE_DESTINATION (safe_room);
4266       c = *charbuf++;
4267       /* Now encode the character C.  */
4268       if (ASCII_CHAR_P (c) && ascii_compatible)
4269         EMIT_ONE_ASCII_BYTE (c);
4270       else if (CHAR_BYTE8_P (c))
4271         {
4272           c = CHAR_TO_BYTE8 (c);
4273           EMIT_ONE_BYTE (c);
4274         }
4275       else
4276         {
4277           unsigned code;
4278           struct charset *charset = char_charset (c, charset_list, &code);
4279
4280           if (!charset)
4281             {
4282               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4283                 {
4284                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4285                   charset = CHARSET_FROM_ID (charset_ascii);
4286                 }
4287               else
4288                 {
4289                   c = coding->default_char;
4290                   charset = char_charset (c, charset_list, &code);
4291                 }
4292             }
4293           if (code == CHARSET_INVALID_CODE (charset))
4294             abort ();
4295           if (charset == charset_kanji)
4296             {
4297               int c1, c2;
4298               JIS_TO_SJIS (code);
4299               c1 = code >> 8, c2 = code & 0xFF;
4300               EMIT_TWO_BYTES (c1, c2);
4301             }
4302           else if (charset == charset_kana)
4303             EMIT_ONE_BYTE (code | 0x80);
4304           else if (charset_kanji2 && charset == charset_kanji2)
4305             {
4306               int c1, c2;
4307
4308               c1 = code >> 8;
4309               if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4310                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4311                 {
4312                   JIS_TO_SJIS2 (code);
4313                   c1 = code >> 8, c2 = code & 0xFF;
4314                   EMIT_TWO_BYTES (c1, c2);
4315                 }
4316               else
4317                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4318             }
4319           else
4320             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4321         }
4322     }
4323   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4324   coding->produced_char += produced_chars;
4325   coding->produced = dst - coding->destination;
4326   return 0;
4327 }
4328
4329 static int
4330 encode_coding_big5 (coding)
4331      struct coding_system *coding;
4332 {
4333   int multibytep = coding->dst_multibyte;
4334   int *charbuf = coding->charbuf;
4335   int *charbuf_end = charbuf + coding->charbuf_used;
4336   unsigned char *dst = coding->destination + coding->produced;
4337   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4338   int safe_room = 4;
4339   int produced_chars = 0;
4340   Lisp_Object attrs, charset_list, val;
4341   int ascii_compatible;
4342   struct charset *charset_roman, *charset_big5;
4343   int c;
4344
4345   CODING_GET_INFO (coding, attrs, charset_list);
4346   val = charset_list;
4347   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4348   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4349   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4350
4351   while (charbuf < charbuf_end)
4352     {
4353       ASSURE_DESTINATION (safe_room);
4354       c = *charbuf++;
4355       /* Now encode the character C.  */
4356       if (ASCII_CHAR_P (c) && ascii_compatible)
4357         EMIT_ONE_ASCII_BYTE (c);
4358       else if (CHAR_BYTE8_P (c))
4359         {
4360           c = CHAR_TO_BYTE8 (c);
4361           EMIT_ONE_BYTE (c);
4362         }
4363       else
4364         {
4365           unsigned code;
4366           struct charset *charset = char_charset (c, charset_list, &code);
4367
4368           if (! charset)
4369             {
4370               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4371                 {
4372                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4373                   charset = CHARSET_FROM_ID (charset_ascii);
4374                 }
4375               else
4376                 {
4377                   c = coding->default_char;
4378                   charset = char_charset (c, charset_list, &code);
4379                 }
4380             }
4381           if (code == CHARSET_INVALID_CODE (charset))
4382             abort ();
4383           if (charset == charset_big5)
4384             {
4385               int c1, c2;
4386
4387               c1 = code >> 8, c2 = code & 0xFF;
4388               EMIT_TWO_BYTES (c1, c2);
4389             }
4390           else
4391             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4392         }
4393     }
4394   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4395   coding->produced_char += produced_chars;
4396   coding->produced = dst - coding->destination;
4397   return 0;
4398 }
4399
4400 \f
4401 /*** 10. CCL handlers ***/
4402
4403 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4404    Check if a text is encoded in a coding system of which
4405    encoder/decoder are written in CCL program.  If it is, return
4406    CATEGORY_MASK_CCL, else return 0.  */
4407
4408 static int
4409 detect_coding_ccl (coding, detect_info)
4410      struct coding_system *coding;
4411      struct coding_detection_info *detect_info;
4412 {
4413   const unsigned char *src = coding->source, *src_base;
4414   const unsigned char *src_end = coding->source + coding->src_bytes;
4415   int multibytep = coding->src_multibyte;
4416   int consumed_chars = 0;
4417   int found = 0;
4418   unsigned char *valids;
4419   int head_ascii = coding->head_ascii;
4420   Lisp_Object attrs;
4421
4422   detect_info->checked |= CATEGORY_MASK_CCL;
4423
4424   coding = &coding_categories[coding_category_ccl];
4425   valids = CODING_CCL_VALIDS (coding);
4426   attrs = CODING_ID_ATTRS (coding->id);
4427   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4428     src += head_ascii;
4429
4430   while (1)
4431     {
4432       int c;
4433
4434       src_base = src;
4435       ONE_MORE_BYTE (c);
4436       if (c < 0 || ! valids[c])
4437         break;
4438       if ((valids[c] > 1))
4439         found = CATEGORY_MASK_CCL;
4440     }
4441   detect_info->rejected |= CATEGORY_MASK_CCL;
4442   return 0;
4443
4444  no_more_source:
4445   detect_info->found |= found;
4446   return 1;
4447 }
4448
4449 static void
4450 decode_coding_ccl (coding)
4451      struct coding_system *coding;
4452 {
4453   const unsigned char *src = coding->source + coding->consumed;
4454   const unsigned char *src_end = coding->source + coding->src_bytes;
4455   int *charbuf = coding->charbuf + coding->charbuf_used;
4456   int *charbuf_end = coding->charbuf + coding->charbuf_size;
4457   int consumed_chars = 0;
4458   int multibytep = coding->src_multibyte;
4459   struct ccl_program ccl;
4460   int source_charbuf[1024];
4461   int source_byteidx[1024];
4462   Lisp_Object attrs, charset_list;
4463
4464   CODING_GET_INFO (coding, attrs, charset_list);
4465   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4466
4467   while (src < src_end)
4468     {
4469       const unsigned char *p = src;
4470       int *source, *source_end;
4471       int i = 0;
4472
4473       if (multibytep)
4474         while (i < 1024 && p < src_end)
4475           {
4476             source_byteidx[i] = p - src;
4477             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4478           }
4479       else
4480         while (i < 1024 && p < src_end)
4481           source_charbuf[i++] = *p++;
4482
4483       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4484         ccl.last_block = 1;
4485
4486       source = source_charbuf;
4487       source_end = source + i;
4488       while (source < source_end)
4489         {
4490           ccl_driver (&ccl, source, charbuf,
4491                       source_end - source, charbuf_end - charbuf,
4492                       charset_list);
4493           source += ccl.consumed;
4494           charbuf += ccl.produced;
4495           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4496             break;
4497         }
4498       if (source < source_end)
4499         src += source_byteidx[source - source_charbuf];
4500       else
4501         src = p;
4502       consumed_chars += source - source_charbuf;
4503
4504       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4505           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4506         break;
4507     }
4508
4509   switch (ccl.status)
4510     {
4511     case CCL_STAT_SUSPEND_BY_SRC:
4512       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4513       break;
4514     case CCL_STAT_SUSPEND_BY_DST:
4515       break;
4516     case CCL_STAT_QUIT:
4517     case CCL_STAT_INVALID_CMD:
4518       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4519       break;
4520     default:
4521       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4522       break;
4523     }
4524   coding->consumed_char += consumed_chars;
4525   coding->consumed = src - coding->source;
4526   coding->charbuf_used = charbuf - coding->charbuf;
4527 }
4528
     /* Encode the characters in CODING->charbuf by running the CCL
        encoder program of CODING (CODING_CCL_ENCODER) and append the
        resulting bytes to CODING->destination, starting at offset
        CODING->produced.  The CCL output is collected in chunks of at
        most 1024 elements in DESTINATION_CHARBUF, and only the low 8
        bits of each element are emitted.  On exit the final CCL status
        is translated into a conversion result and CODING->produced_char
        and CODING->produced are updated.  Always return 0.  */
4529 static int
4530 encode_coding_ccl (coding)
4531      struct coding_system *coding;
4532 {
4533   struct ccl_program ccl;
4534   int multibytep = coding->dst_multibyte;
4535   int *charbuf = coding->charbuf;
4536   int *charbuf_end = charbuf + coding->charbuf_used;
4537   unsigned char *dst = coding->destination + coding->produced;
4538   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* The loop below stops once fewer than two bytes remain before
          DST_END.  */
4539   unsigned char *adjusted_dst_end = dst_end - 1;
4540   int destination_charbuf[1024];
4541   int i, produced_chars = 0;
4542   Lisp_Object attrs, charset_list;
4543
4544   CODING_GET_INFO (coding, attrs, charset_list);
4545   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4546
4547   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4548   ccl.dst_multibyte = coding->dst_multibyte;
4549
4550   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4551     {
           /* DST_BYTES is capped at the size of DESTINATION_CHARBUF; it is
              passed to ccl_driver, presumably as its output limit
              (ccl_driver is defined elsewhere -- confirm).  */
4552       int dst_bytes = dst_end - dst;
4553       if (dst_bytes > 1024)
4554         dst_bytes = 1024;
4555
4556       ccl_driver (&ccl, charbuf, destination_charbuf,
4557                   charbuf_end - charbuf, dst_bytes, charset_list);
4558       charbuf += ccl.consumed;
           /* NOTE(review): PRODUCED_CHARS is advanced explicitly only in
              the unibyte branch; presumably EMIT_ONE_BYTE advances it
              (and DST) itself -- confirm against the macro.  */
4559       if (multibytep)
4560         for (i = 0; i < ccl.produced; i++)
4561           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4562       else
4563         {
4564           for (i = 0; i < ccl.produced; i++)
4565             *dst++ = destination_charbuf[i] & 0xFF;
4566           produced_chars += ccl.produced;
4567         }
4568     }
4569
       /* Map the status left in CCL to a conversion result; any status
          not listed counts as success.  */
4570   switch (ccl.status)
4571     {
4572     case CCL_STAT_SUSPEND_BY_SRC:
4573       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4574       break;
4575     case CCL_STAT_SUSPEND_BY_DST:
4576       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4577       break;
4578     case CCL_STAT_QUIT:
4579     case CCL_STAT_INVALID_CMD:
4580       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4581       break;
4582     default:
4583       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4584       break;
4585     }
4586
4587   coding->produced_char += produced_chars;
4588   coding->produced = dst - coding->destination;
4589   return 0;
4590 }
4591
4592
4593 \f
4594 /*** 10, 11. no-conversion handlers ***/
4595
4596 /* Raw-text decoder (cf. "GENERAL NOTES on `decode_coding_XXX ()'
4597    functions"): nothing is consumed; chars are flagged at source.  */
4598
4599 static void
4600 decode_coding_raw_text (coding)
4601      struct coding_system *coding;
4602 {
4603   coding->consumed = coding->consumed_char = 0;
4604   coding->chars_at_source = 1;
4605   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4606 }
4607
     /* Encode the characters in CODING->charbuf as raw text and append
        the bytes to CODING->destination, starting at offset
        CODING->produced.  Four cases are handled, keyed on whether the
        destination and the source are multibyte:

          multibyte dst, multibyte src: ASCII and raw 8-bit characters
            are emitted as one byte, any other character as the bytes of
            its multibyte form;
          multibyte dst, unibyte src: each element is emitted as a byte;
          unibyte dst, multibyte src: as the first case, but written
            directly through DST, counting one produced char per
            character;
          unibyte dst, unibyte src: elements are copied one to one.

        Records CODING_RESULT_SUCCESS, updates CODING->produced_char and
        CODING->produced, and always returns 0.  */
4608 static int
4609 encode_coding_raw_text (coding)
4610      struct coding_system *coding;
4611 {
4612   int multibytep = coding->dst_multibyte;
4613   int *charbuf = coding->charbuf;
4614   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4615   unsigned char *dst = coding->destination + coding->produced;
4616   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4617   int produced_chars = 0;
4618   int c;
4619
4620   if (multibytep)
4621     {
           /* NOTE(review): PRODUCED_CHARS is not touched explicitly in
              this branch; presumably the EMIT_* macros maintain it --
              confirm against their definitions.  */
4622       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4623
4624       if (coding->src_multibyte)
4625         while (charbuf < charbuf_end)
4626           {
4627             ASSURE_DESTINATION (safe_room);
4628             c = *charbuf++;
4629             if (ASCII_CHAR_P (c))
4630               EMIT_ONE_ASCII_BYTE (c);
4631             else if (CHAR_BYTE8_P (c))
4632               {
4633                 c = CHAR_TO_BYTE8 (c);
4634                 EMIT_ONE_BYTE (c);
4635               }
4636             else
4637               {
4638                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4639
                     /* Build the multibyte form in STR, then emit it
                        byte by byte.  */
4640                 CHAR_STRING_ADVANCE (c, p1);
4641                 while (p0 < p1)
4642                   {
4643                     EMIT_ONE_BYTE (*p0);
4644                     p0++;
4645                   }
4646               }
4647           }
4648       else
4649         while (charbuf < charbuf_end)
4650           {
4651             ASSURE_DESTINATION (safe_room);
4652             c = *charbuf++;
4653             EMIT_ONE_BYTE (c);
4654           }
4655     }
4656   else
4657     {
4658       if (coding->src_multibyte)
4659         {
4660           int safe_room = MAX_MULTIBYTE_LENGTH;
4661
4662           while (charbuf < charbuf_end)
4663             {
4664               ASSURE_DESTINATION (safe_room);
4665               c = *charbuf++;
4666               if (ASCII_CHAR_P (c))
4667                 *dst++ = c;
4668               else if (CHAR_BYTE8_P (c))
4669                 *dst++ = CHAR_TO_BYTE8 (c);
4670               else
4671                 CHAR_STRING_ADVANCE (c, dst);
4672               produced_chars++;
4673             }
4674         }
4675       else
4676         {
4677           ASSURE_DESTINATION (charbuf_end - charbuf);
4678           while (charbuf < charbuf_end && dst < dst_end)
4679             *dst++ = *charbuf++;
               /* Count the bytes written by this call: DST started at
                  offset CODING->produced, which is only updated at the
                  end of this function.  (Measuring from CODING->dst_bytes
                  would measure from DST_END and never be positive.)  */
4680           produced_chars = dst - (coding->destination + coding->produced);
4681         }
4682     }
4683   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4684   coding->produced_char += produced_chars;
4685   coding->produced = dst - coding->destination;
4686   return 0;
4687 }
4688
4689 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4690    Check if a text is encoded in a charset-based coding system.  If it
4691    is, return 1, else return 0.

        CATEGORY_MASK_CHARSET is always added to DETECT_INFO->checked.
        On returning 0 it is also added to DETECT_INFO->rejected.  On
        returning 1 it is added to DETECT_INFO->found only if at least
        one valid byte >= 0x80 was seen.

        The bytes examined are those of the CODING passed in, but the
        validity table and head_ascii are taken from the coding system
        registered for the charset category.  */
4692
4693 static int
4694 detect_coding_charset (coding, detect_info)
4695      struct coding_system *coding;
4696      struct coding_detection_info *detect_info;
4697 {
4698   const unsigned char *src = coding->source, *src_base;
4699   const unsigned char *src_end = coding->source + coding->src_bytes;
4700   int multibytep = coding->src_multibyte;
4701   int consumed_chars = 0;
4702   Lisp_Object attrs, valids;
4703   int found = 0;
4704
4705   detect_info->checked |= CATEGORY_MASK_CHARSET;
4706
       /* From here on CODING is the charset category's coding system,
          not the caller's argument.  */
4707   coding = &coding_categories[coding_category_charset];
4708   attrs = CODING_ID_ATTRS (coding->id);
4709   valids = AREF (attrs, coding_attr_charset_valids);
4710
       /* For an ASCII-compatible coding system, skip HEAD_ASCII bytes
          before scanning.  */
4711   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4712     src += coding->head_ascii;
4713
       /* The loop is left either by the `break' below (a byte with no
          entry in VALIDS) or through the no_more_source label, which
          presumably is the target of ONE_MORE_BYTE when the source is
          exhausted (macro defined elsewhere -- confirm).  */
4714   while (1)
4715     {
4716       int c;
4717
4718       src_base = src;
4719       ONE_MORE_BYTE (c);
           /* A negative C is skipped without consulting VALIDS.  */
4720       if (c < 0)
4721         continue;
4722       if (NILP (AREF (valids, c)))
4723         break;
4724       if (c >= 0x80)
4725         found = CATEGORY_MASK_CHARSET;
4726     }
4727   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4728   return 0;
4729
4730  no_more_source:
4731   detect_info->found |= found;
4732   return 1;
4733 }
4734
     /* Decode the bytes of CODING->source, starting at offset
        CODING->consumed, into characters appended to CODING->charbuf.
        The first byte of each character is looked up in the table
        `valids' (attribute coding_attr_charset_valids) to find the
        charset, or list of candidate charsets, to decode with.  A byte
        sequence that cannot be decoded is rewound and its first byte is
        stored on its own, counting one error in CODING->errors.  On exit
        CODING->consumed_char, CODING->consumed and CODING->charbuf_used
        are updated.  */
4735 static void
4736 decode_coding_charset (coding)
4737      struct coding_system *coding;
4738 {
4739   const unsigned char *src = coding->source + coding->consumed;
4740   const unsigned char *src_end = coding->source + coding->src_bytes;
4741   const unsigned char *src_base;
4742   int *charbuf = coding->charbuf + coding->charbuf_used;
       /* Stop MAX_ANNOTATION_LENGTH elements short of the end of the
          buffer, presumably so the ADD_CHARSET_DATA annotation below
          always fits -- confirm against the macro.  */
4743   int *charbuf_end
4744     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4745   int consumed_chars = 0, consumed_chars_base;
4746   int multibytep = coding->src_multibyte;
4747   Lisp_Object attrs, charset_list, valids;
       /* LAST_ID and LAST_OFFSET track the current run of characters
          from one non-ASCII charset.  */
4748   int char_offset = coding->produced_char;
4749   int last_offset = char_offset;
4750   int last_id = charset_ascii;
4751
4752   CODING_GET_INFO (coding, attrs, charset_list);
4753   valids = AREF (attrs, coding_attr_charset_valids);
4754
4755   while (1)
4756     {
4757       int c;
4758       Lisp_Object val;
4759       struct charset *charset;
4760       int dim;
4761       int len = 1;
4762       unsigned code;
4763
           /* Remember where this character starts so that it can be
              rewound at invalid_code and reported at no_more_source.  */
4764       src_base = src;
4765       consumed_chars_base = consumed_chars;
4766
4767       if (charbuf >= charbuf_end)
4768         break;
4769
4770       ONE_MORE_BYTE (c);
4771       if (c < 0)
4772         goto invalid_code;
4773       code = c;
4774
           /* The VALIDS entry is nil (invalid first byte), an integer
              (a single charset id), or a list of charset ids.  */
4775       val = AREF (valids, c);
4776       if (NILP (val))
4777         goto invalid_code;
4778       if (INTEGERP (val))
4779         {
4780           charset = CHARSET_FROM_ID (XFASTINT (val));
4781           dim = CHARSET_DIMENSION (charset);
               /* Read the remaining bytes of the code, most significant
                  byte first.  */
4782           while (len < dim)
4783             {
4784               ONE_MORE_BYTE (c);
4785               code = (code << 8) | c;
4786               len++;
4787             }
4788           CODING_DECODE_CHAR (coding, src, src_base, src_end,
4789                               charset, code, c);
4790         }
4791       else
4792         {
4793           /* VAL is a list of charset IDs.  It is assured that the
4794              list is sorted by charset dimensions (smaller one
4795              comes first).  */
               /* Try each candidate in turn, extending CODE with more
                  bytes as the dimension grows, until one decodes to a
                  non-negative character.  */
4796           while (CONSP (val))
4797             {
4798               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4799               dim = CHARSET_DIMENSION (charset);
4800               while (len < dim)
4801                 {
4802                   ONE_MORE_BYTE (c);
4803                   code = (code << 8) | c;
4804                   len++;
4805                 }
4806               CODING_DECODE_CHAR (coding, src, src_base,
4807                                   src_end, charset, code, c);
4808               if (c >= 0)
4809                 break;
4810               val = XCDR (val);
4811             }
4812         }
4813       if (c < 0)
4814         goto invalid_code;
           /* On switching to a different non-ASCII charset, close the
              previous non-ASCII run (if any) with ADD_CHARSET_DATA and
              start a new run at the current offset.  */
4815       if (charset->id != charset_ascii
4816           && last_id != charset->id)
4817         {
4818           if (last_id != charset_ascii)
4819             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4820           last_id = charset->id;
4821           last_offset = char_offset;
4822         }
4823
4824       *charbuf++ = c;
4825       char_offset++;
4826       continue;
4827
4828     invalid_code:
           /* Rewind to the start of the character and store just its
              first byte: negated if negative, as is if ASCII, otherwise
              converted with BYTE8_TO_CHAR.  */
4829       src = src_base;
4830       consumed_chars = consumed_chars_base;
4831       ONE_MORE_BYTE (c);
4832       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4833       char_offset++;
4834       coding->errors++;
4835     }
4836
       /* Reached after the `break' above, and presumably from
          ONE_MORE_BYTE when the source runs out (confirm against the
          macro).  SRC_BASE and CONSUMED_CHARS_BASE are what is reported,
          so a character whose bytes were only partly read is not counted
          as consumed.  */
4837  no_more_source:
4838   if (last_id != charset_ascii)
4839     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4840   coding->consumed_char += consumed_chars_base;
4841   coding->consumed = src_base - coding->source;
4842   coding->charbuf_used = charbuf - coding->charbuf;
4843 }
4844
     /* Encode the characters in CODING->charbuf with a charset-based
        coding system and append the bytes to CODING->destination,
        starting at offset CODING->produced.  For each character:

          - if the coding system is ASCII compatible and the character is
            ASCII, emit it as one ASCII byte;
          - if it is a raw 8-bit character (CHAR_BYTE8_P), emit that byte;
          - otherwise look up a charset and code point with char_charset
            in the coding system's charset list and emit the code point
            in 1 to 4 bytes, most significant byte first, according to
            the charset's dimension;
          - if no charset is found, emit
            CODING_INHIBIT_CHARACTER_SUBSTITUTION when
            CODING_MODE_SAFE_ENCODING is set, else CODING->default_char.

        Records CODING_RESULT_SUCCESS, updates CODING->produced_char and
        CODING->produced, and always returns 0.  */
4845 static int
4846 encode_coding_charset (coding)
4847      struct coding_system *coding;
4848 {
4849   int multibytep = coding->dst_multibyte;
4850   int *charbuf = coding->charbuf;
4851   int *charbuf_end = charbuf + coding->charbuf_used;
4852   unsigned char *dst = coding->destination + coding->produced;
4853   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4854   int safe_room = MAX_MULTIBYTE_LENGTH;
       /* NOTE(review): PRODUCED_CHARS is never modified explicitly here;
          presumably the EMIT_* macros advance it -- confirm against
          their definitions.  */
4855   int produced_chars = 0;
4856   Lisp_Object attrs, charset_list;
4857   int ascii_compatible;
4858   int c;
4859
4860   CODING_GET_INFO (coding, attrs, charset_list);
4861   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4862
4863   while (charbuf < charbuf_end)
4864     {
4865       struct charset *charset;
4866       unsigned code;
4867
4868       ASSURE_DESTINATION (safe_room);
4869       c = *charbuf++;
4870       if (ascii_compatible && ASCII_CHAR_P (c))
4871         EMIT_ONE_ASCII_BYTE (c);
4872       else if (CHAR_BYTE8_P (c))
4873         {
4874           c = CHAR_TO_BYTE8 (c);
4875           EMIT_ONE_BYTE (c);
4876         }
4877       else
4878         {
4879           charset = char_charset (c, charset_list, &code);
4880           if (charset)
4881             {
                   /* Emit CODE big-endian in as many bytes as the
                      charset's dimension; anything other than 1, 2 or 3
                      is treated as 4.  */
4882               if (CHARSET_DIMENSION (charset) == 1)
4883                 EMIT_ONE_BYTE (code);
4884               else if (CHARSET_DIMENSION (charset) == 2)
4885                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4886               else if (CHARSET_DIMENSION (charset) == 3)
4887                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4888               else
4889                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4890                                  (code >> 8) & 0xFF, code & 0xFF);
4891             }
4892           else
4893             {
                   /* No charset in the list can represent C: emit a
                      one-byte substitute instead.  */
4894               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4895                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4896               else
4897                 c = coding->default_char;
4898               EMIT_ONE_BYTE (c);
4899             }
4900         }
4901     }
4902
4903   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4904   coding->produced_char += produced_chars;
4905   coding->produced = dst - coding->destination;
4906   return 0;
4907 }
4908
4909 \f
4910 /*** 10. C library functions ***/
4911
4912 /* Setup coding context CODING from information about CODING_SYSTEM.
4913    If CODING_SYSTEM is nil, `undecided' is assumed.  If
4914    CODING_SYSTEM is invalid, signal an error.

        This resets CODING->mode and CODING->head_ascii, computes
        CODING->common_flags, loads the safe-charsets table and default
        character from the coding system's attributes, and installs the
        detector, decoder and encoder functions that match the coding
        system's type, together with any type-specific state.  */
4915
4916 void
4917 setup_coding_system (coding_system, coding)
4918      Lisp_Object coding_system;
4919      struct coding_system *coding;
4920 {
4921   Lisp_Object attrs;
4922   Lisp_Object eol_type;
4923   Lisp_Object coding_type;
4924   Lisp_Object val;
4925
4926   if (NILP (coding_system))
4927     coding_system = Qundecided;
4928
       /* Stores the id of CODING_SYSTEM in CODING->id; presumably this is
          where an invalid coding system is rejected (macro defined
          elsewhere -- confirm).  */
4929   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4930
4931   attrs = CODING_ID_ATTRS (coding->id);
4932   eol_type = CODING_ID_EOL_TYPE (coding->id);
4933
4934   coding->mode = 0;
4935   coding->head_ascii = -1;
       /* An eol type that is still a vector requests detection.  */
4936   coding->common_flags
4937     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4938   if (! NILP (CODING_ATTR_POST_READ (attrs)))
4939     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4940   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4941     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4942   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4943     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4944
       /* The safe-charsets attribute is a string; CODING keeps a pointer
          to its data and the largest valid index.  */
4945   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4946   coding->max_charset_id = SCHARS (val) - 1;
4947   coding->safe_charsets = (char *) SDATA (val);
4948   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4949
       /* Dispatch on the coding system's type.  */
4950   coding_type = CODING_ATTR_TYPE (attrs);
4951   if (EQ (coding_type, Qundecided))
4952     {
           /* Undecided: no detector, raw-text codecs, and detection is
              always required.  */
4953       coding->detector = NULL;
4954       coding->decoder = decode_coding_raw_text;
4955       coding->encoder = encode_coding_raw_text;
4956       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4957     }
4958   else if (EQ (coding_type, Qiso_2022))
4959     {
4960       int i;
4961       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4962
4963       /* Invoke graphic register 0 to plane 0.  */
4964       CODING_ISO_INVOCATION (coding, 0) = 0;
4965       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4966       CODING_ISO_INVOCATION (coding, 1)
4967         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4968       /* Setup the initial status of designation.  */
4969       for (i = 0; i < 4; i++)
4970         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4971       /* Not single shifting initially.  */
4972       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4973       /* Beginning of buffer should also be regarded as bol. */
4974       CODING_ISO_BOL (coding) = 1;
4975       coding->detector = detect_coding_iso_2022;
4976       coding->decoder = decode_coding_iso_2022;
4977       coding->encoder = encode_coding_iso_2022;
4978       if (flags & CODING_ISO_FLAG_SAFE)
4979         coding->mode |= CODING_MODE_SAFE_ENCODING;
4980       coding->common_flags
4981         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4982             | CODING_REQUIRE_FLUSHING_MASK);
4983       if (flags & CODING_ISO_FLAG_COMPOSITION)
4984         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4985       if (flags & CODING_ISO_FLAG_DESIGNATION)
4986         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4987       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4988         {
               /* Recompute the safe-charsets attribute and reload the
                  cached pointer and size from it.  */
4989           setup_iso_safe_charsets (attrs);
4990           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4991           coding->max_charset_id = SCHARS (val) - 1;
4992           coding->safe_charsets = (char *) SDATA (val);
4993         }
4994       CODING_ISO_FLAGS (coding) = flags;
4995     }
4996   else if (EQ (coding_type, Qcharset))
4997     {
4998       coding->detector = detect_coding_charset;
4999       coding->decoder = decode_coding_charset;
5000       coding->encoder = encode_coding_charset;
5001       coding->common_flags
5002         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5003     }
5004   else if (EQ (coding_type, Qutf_8))
5005     {
5006       coding->detector = detect_coding_utf_8;
5007       coding->decoder = decode_coding_utf_8;
5008       coding->encoder = encode_coding_utf_8;
5009       coding->common_flags
5010         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5011     }
5012   else if (EQ (coding_type, Qutf_16))
5013     {
           /* BOM attribute: a cons means detect, t means with BOM,
              anything else means without BOM.  */
5014       val = AREF (attrs, coding_attr_utf_16_bom);
5015       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5016                                     : EQ (val, Qt) ? utf_16_with_bom
5017                                     : utf_16_without_bom);
           /* Endian attribute: Qbig selects big endian, anything else
              little endian.  */
5018       val = AREF (attrs, coding_attr_utf_16_endian);
5019       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5020                                        : utf_16_little_endian);
5021       CODING_UTF_16_SURROGATE (coding) = 0;
5022       coding->detector = detect_coding_utf_16;
5023       coding->decoder = decode_coding_utf_16;
5024       coding->encoder = encode_coding_utf_16;
5025       coding->common_flags
5026         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5027       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5028         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5029     }
5030   else if (EQ (coding_type, Qccl))
5031     {
5032       coding->detector = detect_coding_ccl;
5033       coding->decoder = decode_coding_ccl;
5034       coding->encoder = encode_coding_ccl;
5035       coding->common_flags
5036         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5037             | CODING_REQUIRE_FLUSHING_MASK);
5038     }
5039   else if (EQ (coding_type, Qemacs_mule))
5040     {
5041       coding->detector = detect_coding_emacs_mule;
5042       coding->decoder = decode_coding_emacs_mule;
5043       coding->encoder = encode_coding_emacs_mule;
5044       coding->common_flags
5045         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5046       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5047           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5048         {
5049           Lisp_Object tail, safe_charsets;
5050           int max_charset_id = 0;
5051
               /* Build a fresh safe-charsets string covering every id in
                  Vemacs_mule_charset_list: all entries start as 255 and
                  the entry of each listed charset is set to 0.  */
5052           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5053                tail = XCDR (tail))
5054             if (max_charset_id < XFASTINT (XCAR (tail)))
5055               max_charset_id = XFASTINT (XCAR (tail));
5056           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5057                                         make_number (255));
5058           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5059                tail = XCDR (tail))
5060             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5061           coding->max_charset_id = max_charset_id;
5062           coding->safe_charsets = (char *) SDATA (safe_charsets);
5063         }
5064     }
5065   else if (EQ (coding_type, Qshift_jis))
5066     {
5067       coding->detector = detect_coding_sjis;
5068       coding->decoder = decode_coding_sjis;
5069       coding->encoder = encode_coding_sjis;
5070       coding->common_flags
5071         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5072     }
5073   else if (EQ (coding_type, Qbig5))
5074     {
5075       coding->detector = detect_coding_big5;
5076       coding->decoder = decode_coding_big5;
5077       coding->encoder = encode_coding_big5;
5078       coding->common_flags
5079         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5080     }
5081   else                          /* EQ (coding_type, Qraw_text) */
5082     {
5083       coding->detector = NULL;
5084       coding->decoder = decode_coding_raw_text;
5085       coding->encoder = encode_coding_raw_text;
           /* Decoding is required unless the eol type is unix; encoding
              is required only when the eol type is also not a vector.  */
5086       if (! EQ (eol_type, Qunix))
5087         {
5088           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5089           if (! VECTORP (eol_type))
5090             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5091         }
5092
5093     }
5094
5095   return;
5096 }
5097
5098 /* Return the charset list that applies to CODING: the global
5099    emacs-mule list for emacs-mule codings, the global ISO-2022 list
5100    for ISO-2022 codings flagged as full-support, and otherwise the
5101    list that CODING_GET_INFO stores in charset_list.  */
5102
5103 Lisp_Object
5104 coding_charset_list (coding)
5105      struct coding_system *coding;
5106 {
5107   Lisp_Object attrs, charset_list, type;
5108
5109   CODING_GET_INFO (coding, attrs, charset_list);
5110   type = CODING_ATTR_TYPE (attrs);
5111
5112   if (EQ (type, Qemacs_mule))
5113     return Vemacs_mule_charset_list;
5114   if (EQ (type, Qiso_2022)
5115       && (XINT (AREF (attrs, coding_attr_iso_flags))
5116           & CODING_ISO_FLAG_FULL_SUPPORT))
5117     return Viso_2022_charset_list;
5118   return charset_list;
5119 }
5120
5121
5122 /* Map CODING_SYSTEM to the raw-text variant with the same eol type:
5123    nil or a still-undetermined (vector) eol type yields plain raw-text,
5124    and a coding system already of type raw-text is returned as is.  */
5125
5126 Lisp_Object
5127 raw_text_coding_system (coding_system)
5128      Lisp_Object coding_system;
5129 {
5130   Lisp_Object cs_spec, eol, raw_spec, subsidiaries;
5131   int slot;
5132
5133   if (NILP (coding_system))
5134     return Qraw_text;
5135   cs_spec = CODING_SYSTEM_SPEC (coding_system);
5136   if (EQ (CODING_ATTR_TYPE (AREF (cs_spec, 0)), Qraw_text))
5137     return coding_system;
5138
5139   eol = AREF (cs_spec, 2);
5140   if (VECTORP (eol))
5141     return Qraw_text;
5142
5143   /* Slot 0 is picked for unix, 1 for dos, 2 for anything else.  */
5144   slot = EQ (eol, Qunix) ? 0 : EQ (eol, Qdos) ? 1 : 2;
5145   raw_spec = CODING_SYSTEM_SPEC (Qraw_text);
5146   subsidiaries = AREF (raw_spec, 2);
5147   return AREF (subsidiaries, slot);
5148 }
5149
5150
5151 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5152    does, return one of the subsidiary that has the same eol-spec as
5153    PARENT.  Otherwise, return CODING_SYSTEM.

        A nil CODING_SYSTEM is treated as raw-text.  A nil PARENT leaves
        CODING_SYSTEM unchanged.  A non-nil PARENT must name a defined
        coding system: its spec is dereferenced without a check, exactly
        as CODING_SYSTEM's is.  */
5154
5155 Lisp_Object
5156 coding_inherit_eol_type (coding_system, parent)
5157      Lisp_Object coding_system, parent;
5158 {
5159   Lisp_Object spec, eol_type;
5160
5161   if (NILP (coding_system))
5162     coding_system = Qraw_text;
5163   spec = CODING_SYSTEM_SPEC (coding_system);
5164   eol_type = AREF (spec, 2);
       /* A vector eol type means CODING_SYSTEM has no fixed eol format;
          its elements 0, 1 and 2 are the unix, dos and mac
          subsidiaries.  */
5165   if (VECTORP (eol_type)
5166       && ! NILP (parent))
5167     {
5168       Lisp_Object parent_spec;
5169       Lisp_Object parent_eol_type;
5170
5171       /* Inherit from PARENT itself, not from the buffer defaults.  */
5172       parent_spec = CODING_SYSTEM_SPEC (parent);
5173       parent_eol_type = AREF (parent_spec, 2);
5174       if (EQ (parent_eol_type, Qunix))
5175         coding_system = AREF (eol_type, 0);
5176       else if (EQ (parent_eol_type, Qdos))
5177         coding_system = AREF (eol_type, 1);
5178       else if (EQ (parent_eol_type, Qmac))
5179         coding_system = AREF (eol_type, 2);
5180     }
5181   return coding_system;
5182 }
5183
5184 /* Emacs has a mechanism to automatically detect a coding system if it
5185    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5186    it's impossible to distinguish some coding systems accurately
5187    because they use the same range of codes.  So, at first, coding
5188    systems are grouped into the following categories:
5189
5190    o coding-category-emacs-mule
5191
5192         The category for a coding system which has the same code range
5193         as Emacs' internal format.  Assigned the coding-system (Lisp
5194         symbol) `emacs-mule' by default.
5195
5196    o coding-category-sjis
5197
5198         The category for a coding system which has the same code range
5199         as SJIS.  Assigned the coding-system (Lisp
5200         symbol) `japanese-shift-jis' by default.
5201
5202    o coding-category-iso-7
5203
5204         The category for a coding system which has the same code range
5205         as ISO2022 of 7-bit environment.  This doesn't use any locking
5206         shift and single shift functions.  This can encode/decode all
5207         charsets.  Assigned the coding-system (Lisp symbol)
5208         `iso-2022-7bit' by default.
5209
5210    o coding-category-iso-7-tight
5211
5212         Same as coding-category-iso-7 except that this can
5213         encode/decode only the specified charsets.
5214
5215    o coding-category-iso-8-1
5216
5217         The category for a coding system which has the same code range
5218         as ISO2022 of 8-bit environment and graphic plane 1 used only
5219         for DIMENSION1 charset.  This doesn't use any locking shift
5220         and single shift functions.  Assigned the coding-system (Lisp
5221         symbol) `iso-latin-1' by default.
5222
5223    o coding-category-iso-8-2
5224
5225         The category for a coding system which has the same code range
5226         as ISO2022 of 8-bit environment and graphic plane 1 used only
5227         for DIMENSION2 charset.  This doesn't use any locking shift
5228         and single shift functions.  Assigned the coding-system (Lisp
5229         symbol) `japanese-iso-8bit' by default.
5230
5231    o coding-category-iso-7-else
5232
5233         The category for a coding system which has the same code range
5234         as ISO2022 of 7-bit environment but uses locking shift or
5235         single shift functions.  Assigned the coding-system (Lisp
5236         symbol) `iso-2022-7bit-lock' by default.
5237
5238    o coding-category-iso-8-else
5239
5240         The category for a coding system which has the same code range
5241         as ISO2022 of 8-bit environment but uses locking shift or
5242         single shift functions.  Assigned the coding-system (Lisp
5243         symbol) `iso-2022-8bit-ss2' by default.
5244
5245    o coding-category-big5
5246
5247         The category for a coding system which has the same code range
5248         as BIG5.  Assigned the coding-system (Lisp symbol)
5249         `cn-big5' by default.
5250
5251    o coding-category-utf-8
5252
5253         The category for a coding system which has the same code range
5254         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
5255         symbol) `utf-8' by default.
5256
5257    o coding-category-utf-16-be
5258
5259         The category for a coding system in which a text has an
5260         Unicode signature (cf. Unicode Standard) in the order of BIG
5261         endian at the head.  Assigned the coding-system (Lisp symbol)
5262         `utf-16-be' by default.
5263
5264    o coding-category-utf-16-le
5265
5266         The category for a coding system in which a text has an
5267         Unicode signature (cf. Unicode Standard) in the order of
5268         LITTLE endian at the head.  Assigned the coding-system (Lisp
5269         symbol) `utf-16-le' by default.
5270
5271    o coding-category-ccl
5272
5273         The category for a coding system of which encoder/decoder is
5274         written in CCL programs.  The default value is nil, i.e., no
5275         coding system is assigned.
5276
5277    o coding-category-binary
5278
5279         The category for a coding system not categorized in any of the
5280         above.  Assigned the coding-system (Lisp symbol)
5281         `no-conversion' by default.
5282
5283    Each of them is a Lisp symbol and the value is an actual
5284    `coding-system's (this is also a Lisp symbol) assigned by a user.
5285    What Emacs does actually is to detect a category of coding system.
5286    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5287    decide only one possible category, it selects a category of the
5288    highest priority.  Priorities of categories are also specified by a
5289    user in a Lisp variable `coding-category-list'.
5290
5291 */
5292
5293 #define EOL_SEEN_NONE   0
5294 #define EOL_SEEN_LF     1
5295 #define EOL_SEEN_CR     2
5296 #define EOL_SEEN_CRLF   4
5297
5298 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5299    SOURCE is encoded.  If CATEGORY is one of
5300    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5301    two-byte, else they are encoded by one-byte.
5302
5303    Return one of EOL_SEEN_XXX.

        At most MAX_EOL_CHECK_COUNT end-of-lines are examined.  If two
        different kinds of end-of-line are found, scanning stops and
        EOL_SEEN_LF is returned.  If none is found, EOL_SEEN_NONE is
        returned.  */
5304
5305 #define MAX_EOL_CHECK_COUNT 3
5306
5307 static int
5308 detect_eol (source, src_bytes, category)
5309      const unsigned char *source;
5310      EMACS_INT src_bytes;
5311      enum coding_category category;
5312 {
5313   const unsigned char *src = source, *src_end = src + src_bytes;
5314   unsigned char c;
5315   int total  = 0;
5316   int eol_seen = EOL_SEEN_NONE;
5317
5318   if ((1 << category) & CATEGORY_MASK_UTF_16)
5319     {
           /* MSB and LSB are the offsets, within each two-byte unit, of
              the most and least significant byte.  */
5320       int msb, lsb;
5321
           /* Little endian puts the most significant byte second.  Test
              each little-endian category separately; OR-ing the two enum
              values together would compare against an unrelated
              number.  */
5322       msb = (category == coding_category_utf_16_le
5323              || category == coding_category_utf_16_le_nosig);
5324       lsb = 1 - msb;
5325
5326       while (src + 1 < src_end)
5327         {
5328           c = src[lsb];
5329           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5330             {
5331               int this_eol;
5332
5333               if (c == '\n')
5334                 this_eol = EOL_SEEN_LF;
5335               else if (src + 3 >= src_end
5336                        || src[msb + 2] != 0
5337                        || src[lsb + 2] != '\n')
5338                 this_eol = EOL_SEEN_CR;
5339               else
                     /* Step over the CR unit here; the `src += 2' at the
                        bottom of the loop then steps over the LF unit, so
                        the LF is not counted again as a separate
                        end-of-line (the one-byte branch below does the
                        same with `src++').  */
5340                 this_eol = EOL_SEEN_CRLF, src += 2;
5341
5342               if (eol_seen == EOL_SEEN_NONE)
5343                 /* This is the first end-of-line.  */
5344                 eol_seen = this_eol;
5345               else if (eol_seen != this_eol)
5346                 {
5347                   /* The found type is different from what found before.  */
5348                   eol_seen = EOL_SEEN_LF;
5349                   break;
5350                 }
5351               if (++total == MAX_EOL_CHECK_COUNT)
5352                 break;
5353             }
5354           src += 2;
5355         }
5356     }
5357   else
5358     {
5359       while (src < src_end)
5360         {
5361           c = *src++;
5362           if (c == '\n' || c == '\r')
5363             {
5364               int this_eol;
5365
5366               if (c == '\n')
5367                 this_eol = EOL_SEEN_LF;
5368               else if (src >= src_end || *src != '\n')
5369                 this_eol = EOL_SEEN_CR;
5370               else
5371                 this_eol = EOL_SEEN_CRLF, src++;
5372
5373               if (eol_seen == EOL_SEEN_NONE)
5374                 /* This is the first end-of-line.  */
5375                 eol_seen = this_eol;
5376               else if (eol_seen != this_eol)
5377                 {
5378                   /* The found type is different from what found before.  */
5379                   eol_seen = EOL_SEEN_LF;
5380                   break;
5381                 }
5382               if (++total == MAX_EOL_CHECK_COUNT)
5383                 break;
5384             }
5385         }
5386     }
5387   return eol_seen;
5388 }
5389
5390 /* Switch CODING to the subsidiary matching EOL_SEEN (LF is tested
5391    first, then CRLF, then CR) and return Qunix, Qdos or Qmac.  When
5392    none of those bits is set, leave CODING alone and return its
5393    current eol type.  */
5394
5395 static Lisp_Object
5396 adjust_coding_eol_type (coding, eol_seen)
5397      struct coding_system *coding;
5398      int eol_seen;
5399 {
5400   Lisp_Object subsidiaries = CODING_ID_EOL_TYPE (coding->id);
5401   Lisp_Object chosen;
5402   int slot;
5403
5404   if (eol_seen & EOL_SEEN_LF)
5405     slot = 0, chosen = Qunix;
5406   else if (eol_seen & EOL_SEEN_CRLF)
5407     slot = 1, chosen = Qdos;
5408   else if (eol_seen & EOL_SEEN_CR)
5409     slot = 2, chosen = Qmac;
5410   else
5411     return subsidiaries;
5412
5413   coding->id = CODING_SYSTEM_ID (AREF (subsidiaries, slot));
5414   return chosen;
5415 }
5416
5417 /* Detect how a text specified in CODING is encoded.  If a coding
5418    system is detected, update fields of CODING by the detected coding
5419    system.  */
5420
5421 void
5422 detect_coding (coding)
5423      struct coding_system *coding;
5424 {
5425   const unsigned char *src, *src_end;
5426
5427   coding->consumed = coding->consumed_char = 0;
5428   coding->produced = coding->produced_char = 0;
5429   coding_set_source (coding);
5430
5431   src_end = coding->source + coding->src_bytes;
5432
5433   /* If we have not yet decided the text encoding type, detect it
5434      now.  */
5435   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5436     {
5437       int c, i;
5438       struct coding_detection_info detect_info;
5439
5440       detect_info.checked = detect_info.found = detect_info.rejected = 0;
5441       for (i = 0, src = coding->source; src < src_end; i++, src++)
5442         {
5443           c = *src;
5444           if (c & 0x80)
5445             break;
5446           if (c < 0x20
5447               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5448               && ! inhibit_iso_escape_detection
5449               && ! detect_info.checked)
5450             {
5451               coding->head_ascii = src - (coding->source + coding->consumed);
5452               if (detect_coding_iso_2022 (coding, &detect_info))
5453                 {
5454                   /* We have scanned the whole data.  */
5455                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5456                     /* We didn't find an 8-bit code.  */
5457                     src = src_end;
5458                   break;
5459                 }
5460             }
5461         }
5462       coding->head_ascii = src - (coding->source + coding->consumed);
5463
5464       if (coding->head_ascii < coding->src_bytes
5465           || detect_info.found)
5466         {
5467           enum coding_category category;
5468           struct coding_system *this;
5469
5470           if (coding->head_ascii == coding->src_bytes)
5471             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
5472             for (i = 0; i < coding_category_raw_text; i++)
5473               {
5474                 category = coding_priorities[i];
5475                 this = coding_categories + category;
5476                 if (detect_info.found & (1 << category))
5477                   break;
5478               }
5479           else
5480             for (i = 0; i < coding_category_raw_text; i++)
5481               {
5482                 category = coding_priorities[i];
5483                 this = coding_categories + category;
5484                 if (this->id < 0)
5485                   {
5486                     /* No coding system of this category is defined.  */
5487                     detect_info.rejected |= (1 << category);
5488                   }
5489                 else if (category >= coding_category_raw_text)
5490                   continue;
5491                 else if (detect_info.checked & (1 << category))
5492                   {
5493                     if (detect_info.found & (1 << category))
5494                       break;
5495                   }
5496                 else if ((*(this->detector)) (coding, &detect_info)
5497                          && detect_info.found & (1 << category))
5498                   {
5499                     if (category == coding_category_utf_16_auto)
5500                       {
5501                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5502                           category = coding_category_utf_16_le;
5503                         else
5504                           category = coding_category_utf_16_be;
5505                       }
5506                     break;
5507                   }
5508               }
5509
5510           if (i < coding_category_raw_text)
5511             setup_coding_system (CODING_ID_NAME (this->id), coding);
5512           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5513             setup_coding_system (Qraw_text, coding);
5514           else if (detect_info.rejected)
5515             for (i = 0; i < coding_category_raw_text; i++)
5516               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5517                 {
5518                   this = coding_categories + coding_priorities[i];
5519                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5520                   break;
5521                 }
5522         }
5523     }
5524   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5525            == coding_category_utf_16_auto)
5526     {
5527       Lisp_Object coding_systems;
5528       struct coding_detection_info detect_info;
5529
5530       coding_systems
5531         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5532       detect_info.found = detect_info.rejected = 0;
5533       if (CONSP (coding_systems)
5534           && detect_coding_utf_16 (coding, &detect_info))
5535         {
5536           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5537             setup_coding_system (XCAR (coding_systems), coding);
5538           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5539             setup_coding_system (XCDR (coding_systems), coding);
5540         }
5541     }
5542 }
5543
5544
5545 static void
5546 decode_eol (coding)
5547      struct coding_system *coding;
5548 {
5549   Lisp_Object eol_type;
5550   unsigned char *p, *pbeg, *pend;
5551
5552   eol_type = CODING_ID_EOL_TYPE (coding->id);
5553   if (EQ (eol_type, Qunix))
5554     return;
5555
5556   if (NILP (coding->dst_object))
5557     pbeg = coding->destination;
5558   else
5559     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5560   pend = pbeg + coding->produced;
5561
5562   if (VECTORP (eol_type))
5563     {
5564       int eol_seen = EOL_SEEN_NONE;
5565
5566       for (p = pbeg; p < pend; p++)
5567         {
5568           if (*p == '\n')
5569             eol_seen |= EOL_SEEN_LF;
5570           else if (*p == '\r')
5571             {
5572               if (p + 1 < pend && *(p + 1) == '\n')
5573                 {
5574                   eol_seen |= EOL_SEEN_CRLF;
5575                   p++;
5576                 }
5577               else
5578                 eol_seen |= EOL_SEEN_CR;
5579             }
5580         }
5581       if (eol_seen != EOL_SEEN_NONE
5582           && eol_seen != EOL_SEEN_LF
5583           && eol_seen != EOL_SEEN_CRLF
5584           && eol_seen != EOL_SEEN_CR)
5585         eol_seen = EOL_SEEN_LF;
5586       if (eol_seen != EOL_SEEN_NONE)
5587         eol_type = adjust_coding_eol_type (coding, eol_seen);
5588     }
5589
5590   if (EQ (eol_type, Qmac))
5591     {
5592       for (p = pbeg; p < pend; p++)
5593         if (*p == '\r')
5594           *p = '\n';
5595     }
5596   else if (EQ (eol_type, Qdos))
5597     {
5598       int n = 0;
5599
5600       if (NILP (coding->dst_object))
5601         {
5602           for (p = pend - 2; p >= pbeg; p--)
5603             if (*p == '\r')
5604               {
5605                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5606                 n++;
5607               }
5608         }
5609       else
5610         {
5611           for (p = pend - 2; p >= pbeg; p--)
5612             if (*p == '\r')
5613               {
5614                 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5615                 int pos = BYTE_TO_CHAR (pos_byte);
5616
5617                 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5618                 n++;
5619               }
5620         }
5621       coding->produced -= n;
5622       coding->produced_char -= n;
5623     }
5624 }
5625
5626
5627 /* Return a translation table (or list of them) from coding system
5628    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5629    decoding (ENCODEP is zero). */
5630
5631 static Lisp_Object
5632 get_translation_table (attrs, encodep, max_lookup)
5633      Lisp_Object attrs;
5634      int encodep, *max_lookup;
5635 {
5636   Lisp_Object standard, translation_table;
5637   Lisp_Object val;
5638
5639   if (encodep)
5640     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5641       standard = Vstandard_translation_table_for_encode;
5642   else
5643     translation_table = CODING_ATTR_DECODE_TBL (attrs),
5644       standard = Vstandard_translation_table_for_decode;
5645   if (NILP (translation_table))
5646     translation_table = standard;
5647   else
5648     {
5649       if (SYMBOLP (translation_table))
5650         translation_table = Fget (translation_table, Qtranslation_table);
5651       else if (CONSP (translation_table))
5652         {
5653           translation_table = Fcopy_sequence (translation_table);
5654           for (val = translation_table; CONSP (val); val = XCDR (val))
5655             if (SYMBOLP (XCAR (val)))
5656               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5657         }
5658       if (CHAR_TABLE_P (standard))
5659         {
5660           if (CONSP (translation_table))
5661             translation_table = nconc2 (translation_table,
5662                                         Fcons (standard, Qnil));
5663           else
5664             translation_table = Fcons (translation_table,
5665                                        Fcons (standard, Qnil));
5666         }
5667     }
5668
5669   if (max_lookup)
5670     {
5671       *max_lookup = 1;
5672       if (CHAR_TABLE_P (translation_table)
5673           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5674         {
5675           val = XCHAR_TABLE (translation_table)->extras[1];
5676           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5677             *max_lookup = XFASTINT (val);
5678         }
5679       else if (CONSP (translation_table))
5680         {
5681           Lisp_Object tail, val;
5682
5683           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5684             if (CHAR_TABLE_P (XCAR (tail))
5685                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5686               {
5687                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5688                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5689                   *max_lookup = XFASTINT (val);
5690               }
5691         }
5692     }
5693   return translation_table;
5694 }
5695
/* Look up character C in TABLE, which is a char-table or a list of
   char-tables (anything else yields no translation).

   C and TRANS must be lvalues.  Whenever a table maps C to a
   character, C is replaced by that character and TRANS is reset to
   Qnil; in a list, the lookup then continues with the next table
   using the new C.  When a table maps C to a non-nil, non-character
   value, the lookup in a list stops and TRANS holds that value.
   Non-char-table elements of a list are skipped.  TRANS is Qnil when
   there is nothing more to apply.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    Lisp_Object lookup_rest_;                                           \
                                                                        \
    (trans) = Qnil;                                                     \
    if (CHAR_TABLE_P (table))                                           \
      {                                                                 \
        (trans) = CHAR_TABLE_REF (table, c);                            \
        if (CHARACTERP (trans))                                         \
          {                                                             \
            (c) = XFASTINT (trans);                                     \
            (trans) = Qnil;                                             \
          }                                                             \
      }                                                                 \
    else                                                                \
      for (lookup_rest_ = (table); CONSP (lookup_rest_);                \
           lookup_rest_ = XCDR (lookup_rest_))                          \
        {                                                               \
          if (! CHAR_TABLE_P (XCAR (lookup_rest_)))                     \
            continue;                                                   \
          (trans) = CHAR_TABLE_REF (XCAR (lookup_rest_), c);            \
          if (CHARACTERP (trans))                                       \
            {                                                           \
              (c) = XFASTINT (trans);                                   \
              (trans) = Qnil;                                           \
            }                                                           \
          else if (! NILP (trans))                                      \
            break;                                                      \
        }                                                               \
  } while (0)
5720
5721
5722 static Lisp_Object
5723 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5724      Lisp_Object val;
5725      int *buf, *buf_end;
5726      int last_block;
5727      int *from_nchars, *to_nchars;
5728 {
5729   /* VAL is TO or (([FROM-CHAR ...] .  TO) ...) where TO is TO-CHAR or
5730      [TO-CHAR ...].  */
5731   if (CONSP (val))
5732     {
5733       Lisp_Object from, tail;
5734       int i, len;
5735
5736       for (tail = val; CONSP (tail); tail = XCDR (tail))
5737         {
5738           val = XCAR (tail);
5739           from = XCAR (val);
5740           len = ASIZE (from);
5741           for (i = 0; i < len; i++)
5742             {
5743               if (buf + i == buf_end)
5744                 {
5745                   if (! last_block)
5746                     return Qt;
5747                   break;
5748                 }
5749               if (XINT (AREF (from, i)) != buf[i])
5750                 break;
5751             }
5752           if (i == len)
5753             {
5754               val = XCDR (val);
5755               *from_nchars = len;
5756               break;
5757             }
5758         }
5759       if (! CONSP (tail))
5760         return Qnil;
5761     }
5762   if (VECTORP (val))
5763     *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5764   else
5765     *buf = XINT (val);
5766   return val;
5767 }
5768
5769
5770 static int
5771 produce_chars (coding, translation_table, last_block)
5772      struct coding_system *coding;
5773      Lisp_Object translation_table;
5774      int last_block;
5775 {
5776   unsigned char *dst = coding->destination + coding->produced;
5777   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5778   int produced;
5779   int produced_chars = 0;
5780   int carryover = 0;
5781
5782   if (! coding->chars_at_source)
5783     {
5784       /* Characters are in coding->charbuf.  */
5785       int *buf = coding->charbuf;
5786       int *buf_end = buf + coding->charbuf_used;
5787
5788       if (BUFFERP (coding->src_object)
5789           && EQ (coding->src_object, coding->dst_object))
5790         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5791
5792       while (buf < buf_end)
5793         {
5794           int c = *buf, i;
5795
5796           if (c >= 0)
5797             {
5798               int from_nchars = 1, to_nchars = 1;
5799               Lisp_Object trans = Qnil;
5800
5801               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5802               if (! NILP (trans))
5803                 {
5804                   trans = get_translation (trans, buf, buf_end, last_block,
5805                                            &from_nchars, &to_nchars);
5806                   if (EQ (trans, Qt))
5807                     break;
5808                   c = *buf;
5809                 }
5810
5811               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5812                 {
5813                   dst = alloc_destination (coding,
5814                                            buf_end - buf
5815                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
5816                                            dst);
5817                   dst_end = coding->destination + coding->dst_bytes;
5818                 }
5819
5820               for (i = 0; i < to_nchars; i++)
5821                 {
5822                   if (i > 0)
5823                     c = XINT (AREF (trans, i));
5824                   if (coding->dst_multibyte
5825                       || ! CHAR_BYTE8_P (c))
5826                     CHAR_STRING_ADVANCE (c, dst);
5827                   else
5828                     *dst++ = CHAR_TO_BYTE8 (c);
5829                 }
5830               produced_chars += to_nchars;
5831               *buf++ = to_nchars;
5832               while (--from_nchars > 0)
5833                 *buf++ = 0;
5834             }
5835           else
5836             /* This is an annotation datum.  (-C) is the length.  */
5837             buf += -c;
5838         }
5839       carryover = buf_end - buf;
5840     }
5841   else
5842     {
5843       const unsigned char *src = coding->source;
5844       const unsigned char *src_end = src + coding->src_bytes;
5845       Lisp_Object eol_type;
5846
5847       eol_type = CODING_ID_EOL_TYPE (coding->id);
5848
5849       if (coding->src_multibyte != coding->dst_multibyte)
5850         {
5851           if (coding->src_multibyte)
5852             {
5853               int multibytep = 1;
5854               int consumed_chars;
5855
5856               while (1)
5857                 {
5858                   const unsigned char *src_base = src;
5859                   int c;
5860
5861                   ONE_MORE_BYTE (c);
5862                   if (c == '\r')
5863                     {
5864                       if (EQ (eol_type, Qdos))
5865                         {
5866                           if (src == src_end)
5867                             {
5868                               record_conversion_result
5869                                 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5870                               goto no_more_source;
5871                             }
5872                           if (*src == '\n')
5873                             c = *src++;
5874                         }
5875                       else if (EQ (eol_type, Qmac))
5876                         c = '\n';
5877                     }
5878                   if (dst == dst_end)
5879                     {
5880                       coding->consumed = src - coding->source;
5881
5882                     if (EQ (coding->src_object, coding->dst_object))
5883                       dst_end = (unsigned char *) src;
5884                     if (dst == dst_end)
5885                       {
5886                         dst = alloc_destination (coding, src_end - src + 1,
5887                                                  dst);
5888                         dst_end = coding->destination + coding->dst_bytes;
5889                         coding_set_source (coding);
5890                         src = coding->source + coding->consumed;
5891                         src_end = coding->source + coding->src_bytes;
5892                       }
5893                     }
5894                   *dst++ = c;
5895                   produced_chars++;
5896                 }
5897             no_more_source:
5898               ;
5899             }
5900           else
5901             while (src < src_end)
5902               {
5903                 int multibytep = 1;
5904                 int c = *src++;
5905
5906                 if (c == '\r')
5907                   {
5908                     if (EQ (eol_type, Qdos))
5909                       {
5910                         if (src < src_end
5911                             && *src == '\n')
5912                           c = *src++;
5913                       }
5914                     else if (EQ (eol_type, Qmac))
5915                       c = '\n';
5916                   }
5917                 if (dst >= dst_end - 1)
5918                   {
5919                     coding->consumed = src - coding->source;
5920
5921                     if (EQ (coding->src_object, coding->dst_object))
5922                       dst_end = (unsigned char *) src;
5923                     if (dst >= dst_end - 1)
5924                       {
5925                         dst = alloc_destination (coding, src_end - src + 2,
5926                                                  dst);
5927                         dst_end = coding->destination + coding->dst_bytes;
5928                         coding_set_source (coding);
5929                         src = coding->source + coding->consumed;
5930                         src_end = coding->source + coding->src_bytes;
5931                       }
5932                   }
5933                 EMIT_ONE_BYTE (c);
5934               }
5935         }
5936       else
5937         {
5938           if (!EQ (coding->src_object, coding->dst_object))
5939             {
5940               int require = coding->src_bytes - coding->dst_bytes;
5941
5942               if (require > 0)
5943                 {
5944                   EMACS_INT offset = src - coding->source;
5945
5946                   dst = alloc_destination (coding, require, dst);
5947                   coding_set_source (coding);
5948                   src = coding->source + offset;
5949                   src_end = coding->source + coding->src_bytes;
5950                 }
5951             }
5952           produced_chars = coding->src_chars;
5953           while (src < src_end)
5954             {
5955               int c = *src++;
5956
5957               if (c == '\r')
5958                 {
5959                   if (EQ (eol_type, Qdos))
5960                     {
5961                       if (src < src_end
5962                           && *src == '\n')
5963                         c = *src++;
5964                       produced_chars--;
5965                     }
5966                   else if (EQ (eol_type, Qmac))
5967                     c = '\n';
5968                 }
5969               *dst++ = c;
5970             }
5971         }
5972       coding->consumed = coding->src_bytes;
5973       coding->consumed_char = coding->src_chars;
5974     }
5975
5976   produced = dst - (coding->destination + coding->produced);
5977   if (BUFFERP (coding->dst_object))
5978     insert_from_gap (produced_chars, produced);
5979   coding->produced += produced;
5980   coding->produced_char += produced_chars;
5981   return carryover;
5982 }
5983
5984 /* Compose text in CODING->object according to the annotation data at
5985    CHARBUF.  CHARBUF is an array:
5986      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5987  */
5988
5989 static INLINE void
5990 produce_composition (coding, charbuf, pos)
5991      struct coding_system *coding;
5992      int *charbuf;
5993      EMACS_INT pos;
5994 {
5995   int len;
5996   EMACS_INT to;
5997   enum composition_method method;
5998   Lisp_Object components;
5999
6000   len = -charbuf[0];
6001   to = pos + charbuf[2];
6002   if (to <= pos)
6003     return;
6004   method = (enum composition_method) (charbuf[3]);
6005
6006   if (method == COMPOSITION_RELATIVE)
6007     components = Qnil;
6008   else if (method >= COMPOSITION_WITH_RULE
6009            && method <= COMPOSITION_WITH_RULE_ALTCHARS)
6010     {
6011       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
6012       int i;
6013
6014       len -= 4;
6015       charbuf += 4;
6016       for (i = 0; i < len; i++)
6017         {
6018           args[i] = make_number (charbuf[i]);
6019           if (args[i] < 0)
6020             return;
6021         }
6022       components = (method == COMPOSITION_WITH_ALTCHARS
6023                     ? Fstring (len, args) : Fvector (len, args));
6024     }
6025   else
6026     return;
6027   compose_text (pos, to, components, Qnil, coding->dst_object);
6028 }
6029
6030
6031 /* Put `charset' property on text in CODING->object according to
6032    the annotation data at CHARBUF.  CHARBUF is an array:
6033      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6034  */
6035
6036 static INLINE void
6037 produce_charset (coding, charbuf, pos)
6038      struct coding_system *coding;
6039      int *charbuf;
6040      EMACS_INT pos;
6041 {
6042   EMACS_INT from = pos - charbuf[2];
6043   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6044
6045   Fput_text_property (make_number (from), make_number (pos),
6046                       Qcharset, CHARSET_NAME (charset),
6047                       coding->dst_object);
6048 }
6049
6050
/* Number of `int' elements first requested for CODING->charbuf.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca, so the buffer lives only as
   long as the function that uses this macro.  CHARBUF_SIZE elements
   are tried first and the request is halved while it is still above
   1024 elements; the size obtained is stored in CODING->charbuf_size.

   Note that on failure this macro records
   CODING_RESULT_INSUFFICIENT_MEM and RETURNS CODING->result from the
   enclosing function.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int work_area_size_;                                                \
                                                                        \
    (coding)->charbuf = NULL;                                           \
    for (work_area_size_ = CHARBUF_SIZE; work_area_size_ > 1024;        \
         work_area_size_ >>= 1)                                         \
      {                                                                 \
        (coding)->charbuf                                               \
          = (int *) alloca (sizeof (int) * work_area_size_);            \
        if ((coding)->charbuf)                                          \
          break;                                                        \
      }                                                                 \
    if (! (coding)->charbuf)                                            \
      {                                                                 \
        record_conversion_result ((coding),                             \
                                  CODING_RESULT_INSUFFICIENT_MEM);      \
        return (coding)->result;                                        \
      }                                                                 \
    (coding)->charbuf_size = work_area_size_;                           \
  } while (0)
6072
6073
6074 static void
6075 produce_annotation (coding, pos)
6076      struct coding_system *coding;
6077      EMACS_INT pos;
6078 {
6079   int *charbuf = coding->charbuf;
6080   int *charbuf_end = charbuf + coding->charbuf_used;
6081
6082   if (NILP (coding->dst_object))
6083     return;
6084
6085   while (charbuf < charbuf_end)
6086     {
6087       if (*charbuf >= 0)
6088         pos += *charbuf++;
6089       else
6090         {
6091           int len = -*charbuf;
6092           switch (charbuf[1])
6093             {
6094             case CODING_ANNOTATE_COMPOSITION_MASK:
6095               produce_composition (coding, charbuf, pos);
6096               break;
6097             case CODING_ANNOTATE_CHARSET_MASK:
6098               produce_charset (coding, charbuf, pos);
6099               break;
6100             default:
6101               abort ();
6102             }
6103           charbuf += len;
6104         }
6105     }
6106 }
6107
6108 /* Decode the data at CODING->src_object into CODING->dst_object.
6109    CODING->src_object is a buffer, a string, or nil.
6110    CODING->dst_object is a buffer.
6111
6112    If CODING->src_object is a buffer, it must be the current buffer.
6113    In this case, if CODING->src_pos is positive, it is a position of
6114    the source text in the buffer, otherwise, the source text is in the
6115    gap area of the buffer, and CODING->src_pos specifies the offset of
6116    the text from GPT (which must be the same as PT).  If this is the
6117    same buffer as CODING->dst_object, CODING->src_pos must be
6118    negative.
6119
6120    If CODING->src_object is a string, CODING->src_pos in an index to
6121    that string.
6122
6123    If CODING->src_object is nil, CODING->source must already point to
6124    the non-relocatable memory area.  In this case, CODING->src_pos is
6125    an offset from CODING->source.
6126
6127    The decoded data is inserted at the current point of the buffer
6128    CODING->dst_object.
6129 */
6130
6131 static int
6132 decode_coding (coding)
6133      struct coding_system *coding;
6134 {
6135   Lisp_Object attrs;
6136   Lisp_Object undo_list;
6137   Lisp_Object translation_table;
6138   int carryover;
6139   int i;
6140
6141   if (BUFFERP (coding->src_object)
6142       && coding->src_pos > 0
6143       && coding->src_pos < GPT
6144       && coding->src_pos + coding->src_chars > GPT)
6145     move_gap_both (coding->src_pos, coding->src_pos_byte);
6146
6147   undo_list = Qt;
6148   if (BUFFERP (coding->dst_object))
6149     {
6150       if (current_buffer != XBUFFER (coding->dst_object))
6151         set_buffer_internal (XBUFFER (coding->dst_object));
6152       if (GPT != PT)
6153         move_gap_both (PT, PT_BYTE);
6154       undo_list = current_buffer->undo_list;
6155       current_buffer->undo_list = Qt;
6156     }
6157
6158   coding->consumed = coding->consumed_char = 0;
6159   coding->produced = coding->produced_char = 0;
6160   coding->chars_at_source = 0;
6161   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6162   coding->errors = 0;
6163
6164   ALLOC_CONVERSION_WORK_AREA (coding);
6165
6166   attrs = CODING_ID_ATTRS (coding->id);
6167   translation_table = get_translation_table (attrs, 0, NULL);
6168
6169   carryover = 0;
6170   do
6171     {
6172       EMACS_INT pos = coding->dst_pos + coding->produced_char;
6173
6174       coding_set_source (coding);
6175       coding->annotated = 0;
6176       coding->charbuf_used = carryover;
6177       (*(coding->decoder)) (coding);
6178       coding_set_destination (coding);
6179       carryover = produce_chars (coding, translation_table, 0);
6180       if (coding->annotated)
6181         produce_annotation (coding, pos);
6182       for (i = 0; i < carryover; i++)
6183         coding->charbuf[i]
6184           = coding->charbuf[coding->charbuf_used - carryover + i];
6185     }
6186   while (coding->consumed < coding->src_bytes
6187          && ! coding->result);
6188
6189   if (carryover > 0)
6190     {
6191       coding_set_destination (coding);
6192       coding->charbuf_used = carryover;
6193       produce_chars (coding, translation_table, 1);
6194     }
6195
6196   coding->carryover_bytes = 0;
6197   if (coding->consumed < coding->src_bytes)
6198     {
6199       int nbytes = coding->src_bytes - coding->consumed;
6200       const unsigned char *src;
6201
6202       coding_set_source (coding);
6203       coding_set_destination (coding);
6204       src = coding->source + coding->consumed;
6205
6206       if (coding->mode & CODING_MODE_LAST_BLOCK)
6207         {
6208           /* Flush out unprocessed data as binary chars.  We are sure
6209              that the number of data is less than the size of
6210              coding->charbuf.  */
6211           coding->charbuf_used = 0;
6212           while (nbytes-- > 0)
6213             {
6214               int c = *src++;
6215
6216               if (c & 0x80)
6217                 c = BYTE8_TO_CHAR (c);
6218               coding->charbuf[coding->charbuf_used++] = c;
6219             }
6220           produce_chars (coding, Qnil, 1);
6221         }
6222       else
6223         {
6224           /* Record unprocessed bytes in coding->carryover.  We are
6225              sure that the number of data is less than the size of
6226              coding->carryover.  */
6227           unsigned char *p = coding->carryover;
6228
6229           coding->carryover_bytes = nbytes;
6230           while (nbytes-- > 0)
6231             *p++ = *src++;
6232         }
6233       coding->consumed = coding->src_bytes;
6234     }
6235
6236   if (BUFFERP (coding->dst_object))
6237     {
6238       current_buffer->undo_list = undo_list;
6239       record_insert (coding->dst_pos, coding->produced_char);
6240     }
6241   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6242     decode_eol (coding);
6243   return coding->result;
6244 }
6245
6246
6247 /* Extract an annotation datum from a composition starting at POS and
6248    ending before LIMIT of CODING->src_object (buffer or string), store
6249    the data in BUF, set *STOP to a starting position of the next
6250    composition (if any) or to LIMIT, and return the address of the
6251    next element of BUF.
6252
6253    If such an annotation is not found, set *STOP to a starting
6254    position of a composition after POS (if any) or to LIMIT, and
6255    return BUF.  */
6256
6257 static INLINE int *
6258 handle_composition_annotation (pos, limit, coding, buf, stop)
6259      EMACS_INT pos, limit;
6260      struct coding_system *coding;
6261      int *buf;
6262      EMACS_INT *stop;
6263 {
6264   EMACS_INT start, end;
6265   Lisp_Object prop;
6266
6267   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6268       || end > limit)
6269     *stop = limit;
6270   else if (start > pos)
6271     *stop = start;
6272   else
6273     {
6274       if (start == pos)
6275         {
6276           /* We found a composition.  Store the corresponding
6277              annotation data in BUF.  */
6278           int *head = buf;
6279           enum composition_method method = COMPOSITION_METHOD (prop);
6280           int nchars = COMPOSITION_LENGTH (prop);
6281
6282           ADD_COMPOSITION_DATA (buf, nchars, method);
6283           if (method != COMPOSITION_RELATIVE)
6284             {
6285               Lisp_Object components;
6286               int len, i, i_byte;
6287
6288               components = COMPOSITION_COMPONENTS (prop);
6289               if (VECTORP (components))
6290                 {
6291                   len = XVECTOR (components)->size;
6292                   for (i = 0; i < len; i++)
6293                     *buf++ = XINT (AREF (components, i));
6294                 }
6295               else if (STRINGP (components))
6296                 {
6297                   len = SCHARS (components);
6298                   i = i_byte = 0;
6299                   while (i < len)
6300                     {
6301                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6302                       buf++;
6303                     }
6304                 }
6305               else if (INTEGERP (components))
6306                 {
6307                   len = 1;
6308                   *buf++ = XINT (components);
6309                 }
6310               else if (CONSP (components))
6311                 {
6312                   for (len = 0; CONSP (components);
6313                        len++, components = XCDR (components))
6314                     *buf++ = XINT (XCAR (components));
6315                 }
6316               else
6317                 abort ();
6318               *head -= len;
6319             }
6320         }
6321
6322       if (find_composition (end, limit, &start, &end, &prop,
6323                             coding->src_object)
6324           && end <= limit)
6325         *stop = start;
6326       else
6327         *stop = limit;
6328     }
6329   return buf;
6330 }
6331
6332
6333 /* Extract an annotation datum from a text property `charset' at POS of
6334    CODING->src_object (buffer of string), store the data in BUF, set
6335    *STOP to the position where the value of `charset' property changes
6336    (limiting by LIMIT), and return the address of the next element of
6337    BUF.
6338
6339    If the property value is nil, set *STOP to the position where the
6340    property value is non-nil (limiting by LIMIT), and return BUF.  */
6341
6342 static INLINE int *
6343 handle_charset_annotation (pos, limit, coding, buf, stop)
6344      EMACS_INT pos, limit;
6345      struct coding_system *coding;
6346      int *buf;
6347      EMACS_INT *stop;
6348 {
6349   Lisp_Object val, next;
6350   int id;
6351
6352   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6353   if (! NILP (val) && CHARSETP (val))
6354     id = XINT (CHARSET_SYMBOL_ID (val));
6355   else
6356     id = -1;
6357   ADD_CHARSET_DATA (buf, 0, id);
6358   next = Fnext_single_property_change (make_number (pos), Qcharset,
6359                                        coding->src_object,
6360                                        make_number (limit));
6361   *stop = XINT (next);
6362   return buf;
6363 }
6364
6365
6366 static void
6367 consume_chars (coding, translation_table, max_lookup)
6368      struct coding_system *coding;
6369      Lisp_Object translation_table;
6370      int max_lookup;
6371 {
6372   int *buf = coding->charbuf;
6373   int *buf_end = coding->charbuf + coding->charbuf_size;
6374   const unsigned char *src = coding->source + coding->consumed;
6375   const unsigned char *src_end = coding->source + coding->src_bytes;
6376   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6377   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6378   int multibytep = coding->src_multibyte;
6379   Lisp_Object eol_type;
6380   int c;
6381   EMACS_INT stop, stop_composition, stop_charset;
6382   int *lookup_buf = NULL;
6383
6384   if (! NILP (translation_table))
6385     lookup_buf = alloca (sizeof (int) * max_lookup);
6386
6387   eol_type = CODING_ID_EOL_TYPE (coding->id);
6388   if (VECTORP (eol_type))
6389     eol_type = Qunix;
6390
6391   /* Note: composition handling is not yet implemented.  */
6392   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6393
6394   if (NILP (coding->src_object))
6395     stop = stop_composition = stop_charset = end_pos;
6396   else
6397     {
6398       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6399         stop = stop_composition = pos;
6400       else
6401         stop = stop_composition = end_pos;
6402       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6403         stop = stop_charset = pos;
6404       else
6405         stop_charset = end_pos;
6406     }
6407
6408   /* Compensate for CRLF and conversion.  */
6409   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6410   while (buf < buf_end)
6411     {
6412       Lisp_Object trans;
6413
6414       if (pos == stop)
6415         {
6416           if (pos == end_pos)
6417             break;
6418           if (pos == stop_composition)
6419             buf = handle_composition_annotation (pos, end_pos, coding,
6420                                                  buf, &stop_composition);
6421           if (pos == stop_charset)
6422             buf = handle_charset_annotation (pos, end_pos, coding,
6423                                              buf, &stop_charset);
6424           stop = (stop_composition < stop_charset
6425                   ? stop_composition : stop_charset);
6426         }
6427
6428       if (! multibytep)
6429         {
6430           EMACS_INT bytes;
6431
6432           if (coding->encoder == encode_coding_raw_text)
6433             c = *src++, pos++;
6434           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6435             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6436           else
6437             c = BYTE8_TO_CHAR (*src), src++, pos++;
6438         }
6439       else
6440         c = STRING_CHAR_ADVANCE (src), pos++;
6441       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6442         c = '\n';
6443       if (! EQ (eol_type, Qunix))
6444         {
6445           if (c == '\n')
6446             {
6447               if (EQ (eol_type, Qdos))
6448                 *buf++ = '\r';
6449               else
6450                 c = '\r';
6451             }
6452         }
6453
6454       trans = Qnil;
6455       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6456       if (NILP (trans))
6457         *buf++ = c;
6458       else
6459         {
6460           int from_nchars = 1, to_nchars = 1;
6461           int *lookup_buf_end;
6462           const unsigned char *p = src;
6463           int i;
6464
6465           lookup_buf[0] = c;
6466           for (i = 1; i < max_lookup && p < src_end; i++)
6467             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6468           lookup_buf_end = lookup_buf + i;
6469           trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6470                                    &from_nchars, &to_nchars);
6471           if (EQ (trans, Qt)
6472               || buf + to_nchars > buf_end)
6473             break;
6474           *buf++ = *lookup_buf;
6475           for (i = 1; i < to_nchars; i++)
6476             *buf++ = XINT (AREF (trans, i));
6477           for (i = 1; i < from_nchars; i++, pos++)
6478             src += MULTIBYTE_LENGTH_NO_CHECK (src);
6479         }
6480     }
6481
6482   coding->consumed = src - coding->source;
6483   coding->consumed_char = pos - coding->src_pos;
6484   coding->charbuf_used = buf - coding->charbuf;
6485   coding->chars_at_source = 0;
6486 }
6487
6488
6489 /* Encode the text at CODING->src_object into CODING->dst_object.
6490    CODING->src_object is a buffer or a string.
6491    CODING->dst_object is a buffer or nil.
6492
6493    If CODING->src_object is a buffer, it must be the current buffer.
6494    In this case, if CODING->src_pos is positive, it is a position of
6495    the source text in the buffer, otherwise. the source text is in the
6496    gap area of the buffer, and coding->src_pos specifies the offset of
6497    the text from GPT (which must be the same as PT).  If this is the
6498    same buffer as CODING->dst_object, CODING->src_pos must be
6499    negative and CODING should not have `pre-write-conversion'.
6500
6501    If CODING->src_object is a string, CODING should not have
6502    `pre-write-conversion'.
6503
6504    If CODING->dst_object is a buffer, the encoded data is inserted at
6505    the current point of that buffer.
6506
6507    If CODING->dst_object is nil, the encoded data is placed at the
6508    memory area specified by CODING->destination.  */
6509
6510 static int
6511 encode_coding (coding)
6512      struct coding_system *coding;
6513 {
6514   Lisp_Object attrs;
6515   Lisp_Object translation_table;
6516   int max_lookup;
6517
6518   attrs = CODING_ID_ATTRS (coding->id);
6519   if (coding->encoder == encode_coding_raw_text)
6520     translation_table = Qnil, max_lookup = 0;
6521   else
6522     translation_table = get_translation_table (attrs, 1, &max_lookup);
6523
6524   if (BUFFERP (coding->dst_object))
6525     {
6526       set_buffer_internal (XBUFFER (coding->dst_object));
6527       coding->dst_multibyte
6528         = ! NILP (current_buffer->enable_multibyte_characters);
6529     }
6530
6531   coding->consumed = coding->consumed_char = 0;
6532   coding->produced = coding->produced_char = 0;
6533   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6534   coding->errors = 0;
6535
6536   ALLOC_CONVERSION_WORK_AREA (coding);
6537
6538   do {
6539     coding_set_source (coding);
6540     consume_chars (coding, translation_table, max_lookup);
6541     coding_set_destination (coding);
6542     (*(coding->encoder)) (coding);
6543   } while (coding->consumed_char < coding->src_chars);
6544
6545   if (BUFFERP (coding->dst_object))
6546     insert_from_gap (coding->produced_char, coding->produced);
6547
6548   return (coding->result);
6549 }
6550
6551
/* Name (or base name) of work buffer for code conversion.  */
static Lisp_Object Vcode_conversion_workbuf_name;

/* A working buffer used by the top level conversion.  Once it is
   created, it is never destroyed.  It has the name
   Vcode_conversion_workbuf_name.  The other working buffers are
   destroyed after the use is finished, and their names are modified
   versions of Vcode_conversion_workbuf_name.  */
static Lisp_Object Vcode_conversion_reused_workbuf;

/* Nonzero iff Vcode_conversion_reused_workbuf is already in use.
   make_conversion_work_buffer increments it on every call, and
   code_conversion_restore resets it to 0 when the reused buffer is
   released.  */
static int reused_workbuf_in_use;
6564
6565
6566 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
6567    multibyteness of returning buffer.  */
6568
6569 static Lisp_Object
6570 make_conversion_work_buffer (multibyte)
6571      int multibyte;
6572 {
6573   Lisp_Object name, workbuf;
6574   struct buffer *current;
6575
6576   if (reused_workbuf_in_use++)
6577     {
6578       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6579       workbuf = Fget_buffer_create (name);
6580     }
6581   else
6582     {
6583       name = Vcode_conversion_workbuf_name;
6584       workbuf = Fget_buffer_create (name);
6585       if (NILP (Vcode_conversion_reused_workbuf))
6586         Vcode_conversion_reused_workbuf = workbuf;
6587     }
6588   current = current_buffer;
6589   set_buffer_internal (XBUFFER (workbuf));
6590   Ferase_buffer ();
6591   current_buffer->undo_list = Qt;
6592   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6593   set_buffer_internal (current);
6594   return workbuf;
6595 }
6596
6597
6598 static Lisp_Object
6599 code_conversion_restore (arg)
6600      Lisp_Object arg;
6601 {
6602   Lisp_Object current, workbuf;
6603   struct gcpro gcpro1;
6604
6605   GCPRO1 (arg);
6606   current = XCAR (arg);
6607   workbuf = XCDR (arg);
6608   if (! NILP (workbuf))
6609     {
6610       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6611         reused_workbuf_in_use = 0;
6612       else if (! NILP (Fbuffer_live_p (workbuf)))
6613         Fkill_buffer (workbuf);
6614     }
6615   set_buffer_internal (XBUFFER (current));
6616   UNGCPRO;
6617   return Qnil;
6618 }
6619
6620 Lisp_Object
6621 code_conversion_save (with_work_buf, multibyte)
6622      int with_work_buf, multibyte;
6623 {
6624   Lisp_Object workbuf = Qnil;
6625
6626   if (with_work_buf)
6627     workbuf = make_conversion_work_buffer (multibyte);
6628   record_unwind_protect (code_conversion_restore,
6629                          Fcons (Fcurrent_buffer (), workbuf));
6630   return workbuf;
6631 }
6632
6633 int
6634 decode_coding_gap (coding, chars, bytes)
6635      struct coding_system *coding;
6636      EMACS_INT chars, bytes;
6637 {
6638   int count = specpdl_ptr - specpdl;
6639   Lisp_Object attrs;
6640
6641   code_conversion_save (0, 0);
6642
6643   coding->src_object = Fcurrent_buffer ();
6644   coding->src_chars = chars;
6645   coding->src_bytes = bytes;
6646   coding->src_pos = -chars;
6647   coding->src_pos_byte = -bytes;
6648   coding->src_multibyte = chars < bytes;
6649   coding->dst_object = coding->src_object;
6650   coding->dst_pos = PT;
6651   coding->dst_pos_byte = PT_BYTE;
6652   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6653   coding->mode |= CODING_MODE_LAST_BLOCK;
6654
6655   if (CODING_REQUIRE_DETECTION (coding))
6656     detect_coding (coding);
6657
6658   decode_coding (coding);
6659
6660   attrs = CODING_ID_ATTRS (coding->id);
6661   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6662     {
6663       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6664       Lisp_Object val;
6665
6666       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6667       val = call1 (CODING_ATTR_POST_READ (attrs),
6668                    make_number (coding->produced_char));
6669       CHECK_NATNUM (val);
6670       coding->produced_char += Z - prev_Z;
6671       coding->produced += Z_BYTE - prev_Z_BYTE;
6672     }
6673
6674   unbind_to (count, Qnil);
6675   return coding->result;
6676 }
6677
6678 int
6679 encode_coding_gap (coding, chars, bytes)
6680      struct coding_system *coding;
6681      EMACS_INT chars, bytes;
6682 {
6683   int count = specpdl_ptr - specpdl;
6684
6685   code_conversion_save (0, 0);
6686
6687   coding->src_object = Fcurrent_buffer ();
6688   coding->src_chars = chars;
6689   coding->src_bytes = bytes;
6690   coding->src_pos = -chars;
6691   coding->src_pos_byte = -bytes;
6692   coding->src_multibyte = chars < bytes;
6693   coding->dst_object = coding->src_object;
6694   coding->dst_pos = PT;
6695   coding->dst_pos_byte = PT_BYTE;
6696
6697   encode_coding (coding);
6698
6699   unbind_to (count, Qnil);
6700   return coding->result;
6701 }
6702
6703
6704 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6705    SRC_OBJECT into DST_OBJECT by coding context CODING.
6706
6707    SRC_OBJECT is a buffer, a string, or Qnil.
6708
6709    If it is a buffer, the text is at point of the buffer.  FROM and TO
6710    are positions in the buffer.
6711
6712    If it is a string, the text is at the beginning of the string.
6713    FROM and TO are indices to the string.
6714
6715    If it is nil, the text is at coding->source.  FROM and TO are
6716    indices to coding->source.
6717
6718    DST_OBJECT is a buffer, Qt, or Qnil.
6719
6720    If it is a buffer, the decoded text is inserted at point of the
6721    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6722    is deleted.
6723
6724    If it is Qt, a string is made from the decoded text, and
6725    set in CODING->dst_object.
6726
6727    If it is Qnil, the decoded text is stored at CODING->destination.
6728    The caller must allocate CODING->dst_bytes bytes at
6729    CODING->destination by xmalloc.  If the decoded text is longer than
6730    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6731  */
6732
6733 void
6734 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6735                       dst_object)
6736      struct coding_system *coding;
6737      Lisp_Object src_object;
6738      EMACS_INT from, from_byte, to, to_byte;
6739      Lisp_Object dst_object;
6740 {
6741   int count = specpdl_ptr - specpdl;
6742   unsigned char *destination;
6743   EMACS_INT dst_bytes;
6744   EMACS_INT chars = to - from;
6745   EMACS_INT bytes = to_byte - from_byte;
6746   Lisp_Object attrs;
6747   Lisp_Object buffer;
6748   int saved_pt = -1, saved_pt_byte;
6749
6750   buffer = Fcurrent_buffer ();
6751
6752   if (NILP (dst_object))
6753     {
6754       destination = coding->destination;
6755       dst_bytes = coding->dst_bytes;
6756     }
6757
6758   coding->src_object = src_object;
6759   coding->src_chars = chars;
6760   coding->src_bytes = bytes;
6761   coding->src_multibyte = chars < bytes;
6762
6763   if (STRINGP (src_object))
6764     {
6765       coding->src_pos = from;
6766       coding->src_pos_byte = from_byte;
6767     }
6768   else if (BUFFERP (src_object))
6769     {
6770       set_buffer_internal (XBUFFER (src_object));
6771       if (from != GPT)
6772         move_gap_both (from, from_byte);
6773       if (EQ (src_object, dst_object))
6774         {
6775           saved_pt = PT, saved_pt_byte = PT_BYTE;
6776           TEMP_SET_PT_BOTH (from, from_byte);
6777           del_range_both (from, from_byte, to, to_byte, 1);
6778           coding->src_pos = -chars;
6779           coding->src_pos_byte = -bytes;
6780         }
6781       else
6782         {
6783           coding->src_pos = from;
6784           coding->src_pos_byte = from_byte;
6785         }
6786     }
6787
6788   if (CODING_REQUIRE_DETECTION (coding))
6789     detect_coding (coding);
6790   attrs = CODING_ID_ATTRS (coding->id);
6791
6792   if (EQ (dst_object, Qt)
6793       || (! NILP (CODING_ATTR_POST_READ (attrs))
6794           && NILP (dst_object)))
6795     {
6796       coding->dst_object = code_conversion_save (1, 1);
6797       coding->dst_pos = BEG;
6798       coding->dst_pos_byte = BEG_BYTE;
6799       coding->dst_multibyte = 1;
6800     }
6801   else if (BUFFERP (dst_object))
6802     {
6803       code_conversion_save (0, 0);
6804       coding->dst_object = dst_object;
6805       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6806       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6807       coding->dst_multibyte
6808         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6809     }
6810   else
6811     {
6812       code_conversion_save (0, 0);
6813       coding->dst_object = Qnil;
6814       coding->dst_multibyte = 1;
6815     }
6816
6817   decode_coding (coding);
6818
6819   if (BUFFERP (coding->dst_object))
6820     set_buffer_internal (XBUFFER (coding->dst_object));
6821
6822   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6823     {
6824       struct gcpro gcpro1, gcpro2;
6825       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6826       Lisp_Object val;
6827
6828       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6829       GCPRO2 (coding->src_object, coding->dst_object);
6830       val = safe_call1 (CODING_ATTR_POST_READ (attrs),
6831                         make_number (coding->produced_char));
6832       UNGCPRO;
6833       CHECK_NATNUM (val);
6834       coding->produced_char += Z - prev_Z;
6835       coding->produced += Z_BYTE - prev_Z_BYTE;
6836     }
6837
6838   if (EQ (dst_object, Qt))
6839     {
6840       coding->dst_object = Fbuffer_string ();
6841     }
6842   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6843     {
6844       set_buffer_internal (XBUFFER (coding->dst_object));
6845       if (dst_bytes < coding->produced)
6846         {
6847           destination
6848             = (unsigned char *) xrealloc (destination, coding->produced);
6849           if (! destination)
6850             {
6851               record_conversion_result (coding,
6852                                         CODING_RESULT_INSUFFICIENT_DST);
6853               unbind_to (count, Qnil);
6854               return;
6855             }
6856           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6857             move_gap_both (BEGV, BEGV_BYTE);
6858           bcopy (BEGV_ADDR, destination, coding->produced);
6859           coding->destination = destination;
6860         }
6861     }
6862
6863   if (saved_pt >= 0)
6864     {
6865       /* This is the case of:
6866          (BUFFERP (src_object) && EQ (src_object, dst_object))
6867          As we have moved PT while replacing the original buffer
6868          contents, we must recover it now.  */
6869       set_buffer_internal (XBUFFER (src_object));
6870       if (saved_pt < from)
6871         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6872       else if (saved_pt < from + chars)
6873         TEMP_SET_PT_BOTH (from, from_byte);
6874       else if (! NILP (current_buffer->enable_multibyte_characters))
6875         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6876                           saved_pt_byte + (coding->produced - bytes));
6877       else
6878         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6879                           saved_pt_byte + (coding->produced - bytes));
6880     }
6881
6882   unbind_to (count, coding->dst_object);
6883 }
6884
6885
6886 void
6887 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6888                       dst_object)
6889      struct coding_system *coding;
6890      Lisp_Object src_object;
6891      EMACS_INT from, from_byte, to, to_byte;
6892      Lisp_Object dst_object;
6893 {
6894   int count = specpdl_ptr - specpdl;
6895   EMACS_INT chars = to - from;
6896   EMACS_INT bytes = to_byte - from_byte;
6897   Lisp_Object attrs;
6898   Lisp_Object buffer;
6899   int saved_pt = -1, saved_pt_byte;
6900   int kill_src_buffer = 0;
6901
6902   buffer = Fcurrent_buffer ();
6903
6904   coding->src_object = src_object;
6905   coding->src_chars = chars;
6906   coding->src_bytes = bytes;
6907   coding->src_multibyte = chars < bytes;
6908
6909   attrs = CODING_ID_ATTRS (coding->id);
6910
6911   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6912     {
6913       coding->src_object = code_conversion_save (1, coding->src_multibyte);
6914       set_buffer_internal (XBUFFER (coding->src_object));
6915       if (STRINGP (src_object))
6916         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6917       else if (BUFFERP (src_object))
6918         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6919       else
6920         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6921
6922       if (EQ (src_object, dst_object))
6923         {
6924           set_buffer_internal (XBUFFER (src_object));
6925           saved_pt = PT, saved_pt_byte = PT_BYTE;
6926           del_range_both (from, from_byte, to, to_byte, 1);
6927           set_buffer_internal (XBUFFER (coding->src_object));
6928         }
6929
6930       {
6931         Lisp_Object args[3];
6932
6933         args[0] = CODING_ATTR_PRE_WRITE (attrs);
6934         args[1] = make_number (BEG);
6935         args[2] = make_number (Z);
6936         safe_call (3, args);
6937       }
6938       if (XBUFFER (coding->src_object) != current_buffer)
6939         kill_src_buffer = 1;
6940       coding->src_object = Fcurrent_buffer ();
6941       if (BEG != GPT)
6942         move_gap_both (BEG, BEG_BYTE);
6943       coding->src_chars = Z - BEG;
6944       coding->src_bytes = Z_BYTE - BEG_BYTE;
6945       coding->src_pos = BEG;
6946       coding->src_pos_byte = BEG_BYTE;
6947       coding->src_multibyte = Z < Z_BYTE;
6948     }
6949   else if (STRINGP (src_object))
6950     {
6951       code_conversion_save (0, 0);
6952       coding->src_pos = from;
6953       coding->src_pos_byte = from_byte;
6954     }
6955   else if (BUFFERP (src_object))
6956     {
6957       code_conversion_save (0, 0);
6958       set_buffer_internal (XBUFFER (src_object));
6959       if (EQ (src_object, dst_object))
6960         {
6961           saved_pt = PT, saved_pt_byte = PT_BYTE;
6962           coding->src_object = del_range_1 (from, to, 1, 1);
6963           coding->src_pos = 0;
6964           coding->src_pos_byte = 0;
6965         }
6966       else
6967         {
6968           if (from < GPT && to >= GPT)
6969             move_gap_both (from, from_byte);
6970           coding->src_pos = from;
6971           coding->src_pos_byte = from_byte;
6972         }
6973     }
6974   else
6975     code_conversion_save (0, 0);
6976
6977   if (BUFFERP (dst_object))
6978     {
6979       coding->dst_object = dst_object;
6980       if (EQ (src_object, dst_object))
6981         {
6982           coding->dst_pos = from;
6983           coding->dst_pos_byte = from_byte;
6984         }
6985       else
6986         {
6987           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6988           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6989         }
6990       coding->dst_multibyte
6991         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6992     }
6993   else if (EQ (dst_object, Qt))
6994     {
6995       coding->dst_object = Qnil;
6996       coding->dst_bytes = coding->src_chars;
6997       if (coding->dst_bytes == 0)
6998         coding->dst_bytes = 1;
6999       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
7000       coding->dst_multibyte = 0;
7001     }
7002   else
7003     {
7004       coding->dst_object = Qnil;
7005       coding->dst_multibyte = 0;
7006     }
7007
7008   encode_coding (coding);
7009
7010   if (EQ (dst_object, Qt))
7011     {
7012       if (BUFFERP (coding->dst_object))
7013         coding->dst_object = Fbuffer_string ();
7014       else
7015         {
7016           coding->dst_object
7017             = make_unibyte_string ((char *) coding->destination,
7018                                    coding->produced);
7019           xfree (coding->destination);
7020         }
7021     }
7022
7023   if (saved_pt >= 0)
7024     {
7025       /* This is the case of:
7026          (BUFFERP (src_object) && EQ (src_object, dst_object))
7027          As we have moved PT while replacing the original buffer
7028          contents, we must recover it now.  */
7029       set_buffer_internal (XBUFFER (src_object));
7030       if (saved_pt < from)
7031         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
7032       else if (saved_pt < from + chars)
7033         TEMP_SET_PT_BOTH (from, from_byte);
7034       else if (! NILP (current_buffer->enable_multibyte_characters))
7035         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
7036                           saved_pt_byte + (coding->produced - bytes));
7037       else
7038         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
7039                           saved_pt_byte + (coding->produced - bytes));
7040     }
7041
7042   if (kill_src_buffer)
7043     Fkill_buffer (coding->src_object);
7044   unbind_to (count, Qnil);
7045 }
7046
7047
7048 Lisp_Object
7049 preferred_coding_system ()
7050 {
7051   int id = coding_categories[coding_priorities[0]].id;
7052
7053   return CODING_ID_NAME (id);
7054 }
7055
7056 \f
7057 #ifdef emacs
7058 /*** 8. Emacs Lisp library functions ***/
7059
7060 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7061        doc: /* Return t if OBJECT is nil or a coding-system.
7062 See the documentation of `define-coding-system' for information
7063 about coding-system objects.  */)
7064      (obj)
7065      Lisp_Object obj;
7066 {
7067   if (NILP (obj)
7068       || CODING_SYSTEM_ID (obj) >= 0)
7069     return Qt;
7070   if (! SYMBOLP (obj)
7071       || NILP (Fget (obj, Qcoding_system_define_form)))
7072     return Qnil;
7073   return Qt;
7074 }
7075
7076 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7077        Sread_non_nil_coding_system, 1, 1, 0,
7078        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7079      (prompt)
7080      Lisp_Object prompt;
7081 {
7082   Lisp_Object val;
7083   do
7084     {
7085       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7086                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7087     }
7088   while (SCHARS (val) == 0);
7089   return (Fintern (val, Qnil));
7090 }
7091
7092 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7093        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7094 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
7095      (prompt, default_coding_system)
7096      Lisp_Object prompt, default_coding_system;
7097 {
7098   Lisp_Object val;
7099   if (SYMBOLP (default_coding_system))
7100     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
7101   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7102                           Qt, Qnil, Qcoding_system_history,
7103                           default_coding_system, Qnil);
7104   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7105 }
7106
7107 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7108        1, 1, 0,
7109        doc: /* Check validity of CODING-SYSTEM.
7110 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7111 It is valid if it is nil or a symbol defined as a coding system by the
7112 function `define-coding-system'.  */)
7113   (coding_system)
7114      Lisp_Object coding_system;
7115 {
7116   Lisp_Object define_form;
7117
7118   define_form = Fget (coding_system, Qcoding_system_define_form);
7119   if (! NILP (define_form))
7120     {
7121       Fput (coding_system, Qcoding_system_define_form, Qnil);
7122       safe_eval (define_form);
7123     }
7124   if (!NILP (Fcoding_system_p (coding_system)))
7125     return coding_system;
7126   while (1)
7127     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7128 }
7129
7130 \f
7131 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
7132    HIGHEST is nonzero, return the coding system of the highest
7133    priority among the detected coding systems.  Otherwise return a
7134    list of detected coding systems sorted by their priorities.  If
7135    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7136    multibyte form but contains only ASCII and eight-bit chars.
7137    Otherwise, the bytes are raw bytes.
7138
7139    CODING-SYSTEM controls the detection as below:
7140
7141    If it is nil, detect both text-format and eol-format.  If the
7142    text-format part of CODING-SYSTEM is already specified
7143    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
7144    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7145    detect only text-format.  */
7146
7147 Lisp_Object
7148 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7149                       coding_system)
7150      const unsigned char *src;
7151      int src_chars, src_bytes, highest;
7152      int multibytep;
7153      Lisp_Object coding_system;
7154 {
7155   const unsigned char *src_end = src + src_bytes;
7156   Lisp_Object attrs, eol_type;
7157   Lisp_Object val;
7158   struct coding_system coding;
7159   int id;
7160   struct coding_detection_info detect_info;
7161   enum coding_category base_category;
7162
7163   if (NILP (coding_system))
7164     coding_system = Qundecided;
7165   setup_coding_system (coding_system, &coding);
7166   attrs = CODING_ID_ATTRS (coding.id);
7167   eol_type = CODING_ID_EOL_TYPE (coding.id);
7168   coding_system = CODING_ATTR_BASE_NAME (attrs);
7169
7170   coding.source = src;
7171   coding.src_chars = src_chars;
7172   coding.src_bytes = src_bytes;
7173   coding.src_multibyte = multibytep;
7174   coding.consumed = 0;
7175   coding.mode |= CODING_MODE_LAST_BLOCK;
7176
7177   detect_info.checked = detect_info.found = detect_info.rejected = 0;
7178
7179   /* At first, detect text-format if necessary.  */
7180   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7181   if (base_category == coding_category_undecided)
7182     {
7183       enum coding_category category;
7184       struct coding_system *this;
7185       int c, i;
7186
7187       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
7188       for (i = 0; src < src_end; i++, src++)
7189         {
7190           c = *src;
7191           if (c & 0x80)
7192             break;
7193           if (c < 0x20
7194               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7195               && inhibit_iso_escape_detection)
7196             {
7197               coding.head_ascii = src - coding.source;
7198               if (detect_coding_iso_2022 (&coding, &detect_info))
7199                 {
7200                   /* We have scanned the whole data.  */
7201                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7202                     /* We didn't find an 8-bit code.  */
7203                     src = src_end;
7204                   break;
7205                 }
7206             }
7207         }
7208       coding.head_ascii = src - coding.source;
7209
7210       if (src < src_end
7211           || detect_info.found)
7212         {
7213           if (src == src_end)
7214             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
7215             for (i = 0; i < coding_category_raw_text; i++)
7216               {
7217                 category = coding_priorities[i];
7218                 if (detect_info.found & (1 << category))
7219                   break;
7220               }
7221           else
7222             for (i = 0; i < coding_category_raw_text; i++)
7223               {
7224                 category = coding_priorities[i];
7225                 this = coding_categories + category;
7226
7227                 if (this->id < 0)
7228                   {
7229                     /* No coding system of this category is defined.  */
7230                     detect_info.rejected |= (1 << category);
7231                   }
7232                 else if (category >= coding_category_raw_text)
7233                   continue;
7234                 else if (detect_info.checked & (1 << category))
7235                   {
7236                     if (highest
7237                         && (detect_info.found & (1 << category)))
7238                       break;
7239                   }
7240                 else
7241                   {
7242                     if ((*(this->detector)) (&coding, &detect_info)
7243                         && highest
7244                         && (detect_info.found & (1 << category)))
7245                       {
7246                         if (category == coding_category_utf_16_auto)
7247                           {
7248                             if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7249                               category = coding_category_utf_16_le;
7250                             else
7251                               category = coding_category_utf_16_be;
7252                           }
7253                         break;
7254                       }
7255                   }
7256               }
7257         }
7258
7259       if (detect_info.rejected == CATEGORY_MASK_ANY)
7260         {
7261           detect_info.found = CATEGORY_MASK_RAW_TEXT;
7262           id = coding_categories[coding_category_raw_text].id;
7263           val = Fcons (make_number (id), Qnil);
7264         }
7265       else if (! detect_info.rejected && ! detect_info.found)
7266         {
7267           detect_info.found = CATEGORY_MASK_ANY;
7268           id = coding_categories[coding_category_undecided].id;
7269           val = Fcons (make_number (id), Qnil);
7270         }
7271       else if (highest)
7272         {
7273           if (detect_info.found)
7274             {
7275               detect_info.found = 1 << category;
7276               val = Fcons (make_number (this->id), Qnil);
7277             }
7278           else
7279             for (i = 0; i < coding_category_raw_text; i++)
7280               if (! (detect_info.rejected & (1 << coding_priorities[i])))
7281                 {
7282                   detect_info.found = 1 << coding_priorities[i];
7283                   id = coding_categories[coding_priorities[i]].id;
7284                   val = Fcons (make_number (id), Qnil);
7285                   break;
7286                 }
7287         }
7288       else
7289         {
7290           int mask = detect_info.rejected | detect_info.found;
7291           int found = 0;
7292           val = Qnil;
7293
7294           for (i = coding_category_raw_text - 1; i >= 0; i--)
7295             {
7296               category = coding_priorities[i];
7297               if (! (mask & (1 << category)))
7298                 {
7299                   found |= 1 << category;
7300                   id = coding_categories[category].id;
7301                   val = Fcons (make_number (id), val);
7302                 }
7303             }
7304           for (i = coding_category_raw_text - 1; i >= 0; i--)
7305             {
7306               category = coding_priorities[i];
7307               if (detect_info.found & (1 << category))
7308                 {
7309                   id = coding_categories[category].id;
7310                   val = Fcons (make_number (id), val);
7311                 }
7312             }
7313           detect_info.found |= found;
7314         }
7315     }
7316   else if (base_category == coding_category_utf_16_auto)
7317     {
7318       if (detect_coding_utf_16 (&coding, &detect_info))
7319         {
7320           struct coding_system *this;
7321
7322           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7323             this = coding_categories + coding_category_utf_16_le;
7324           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7325             this = coding_categories + coding_category_utf_16_be;
7326           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7327             this = coding_categories + coding_category_utf_16_be_nosig;
7328           else
7329             this = coding_categories + coding_category_utf_16_le_nosig;
7330           val = Fcons (make_number (this->id), Qnil);
7331         }
7332     }
7333   else
7334     {
7335       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7336       val = Fcons (make_number (coding.id), Qnil);
7337     }
7338
7339   /* Then, detect eol-format if necessary.  */
7340   {
7341     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7342     Lisp_Object tail;
7343
7344     if (VECTORP (eol_type))
7345       {
7346         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7347           normal_eol = detect_eol (coding.source, src_bytes,
7348                                    coding_category_raw_text);
7349         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7350                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7351           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7352                                       coding_category_utf_16_be);
7353         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7354                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7355           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7356                                       coding_category_utf_16_le);
7357       }
7358     else
7359       {
7360         if (EQ (eol_type, Qunix))
7361           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7362         else if (EQ (eol_type, Qdos))
7363           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7364         else
7365           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7366       }
7367
7368     for (tail = val; CONSP (tail); tail = XCDR (tail))
7369       {
7370         enum coding_category category;
7371         int this_eol;
7372
7373         id = XINT (XCAR (tail));
7374         attrs = CODING_ID_ATTRS (id);
7375         category = XINT (CODING_ATTR_CATEGORY (attrs));
7376         eol_type = CODING_ID_EOL_TYPE (id);
7377         if (VECTORP (eol_type))
7378           {
7379             if (category == coding_category_utf_16_be
7380                 || category == coding_category_utf_16_be_nosig)
7381               this_eol = utf_16_be_eol;
7382             else if (category == coding_category_utf_16_le
7383                      || category == coding_category_utf_16_le_nosig)
7384               this_eol = utf_16_le_eol;
7385             else
7386               this_eol = normal_eol;
7387
7388             if (this_eol == EOL_SEEN_LF)
7389               XSETCAR (tail, AREF (eol_type, 0));
7390             else if (this_eol == EOL_SEEN_CRLF)
7391               XSETCAR (tail, AREF (eol_type, 1));
7392             else if (this_eol == EOL_SEEN_CR)
7393               XSETCAR (tail, AREF (eol_type, 2));
7394             else
7395               XSETCAR (tail, CODING_ID_NAME (id));
7396           }
7397         else
7398           XSETCAR (tail, CODING_ID_NAME (id));
7399       }
7400   }
7401
7402   return (highest ? XCAR (val) : val);
7403 }
7404
7405
7406 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7407        2, 3, 0,
7408        doc: /* Detect coding system of the text in the region between START and END.
7409 Return a list of possible coding systems ordered by priority.
7410
7411 If only ASCII characters are found, it returns a list of single element
7412 `undecided' or its subsidiary coding system according to a detected
7413 end-of-line format.
7414
7415 If optional argument HIGHEST is non-nil, return the coding system of
7416 highest priority.  */)
7417      (start, end, highest)
7418      Lisp_Object start, end, highest;
7419 {
7420   int from, to;
7421   int from_byte, to_byte;
7422
7423   CHECK_NUMBER_COERCE_MARKER (start);
7424   CHECK_NUMBER_COERCE_MARKER (end);
7425
7426   validate_region (&start, &end);
7427   from = XINT (start), to = XINT (end);
7428   from_byte = CHAR_TO_BYTE (from);
7429   to_byte = CHAR_TO_BYTE (to);
7430
7431   if (from < GPT && to >= GPT)
7432     move_gap_both (to, to_byte);
7433
7434   return detect_coding_system (BYTE_POS_ADDR (from_byte),
7435                                to - from, to_byte - from_byte,
7436                                !NILP (highest),
7437                                !NILP (current_buffer
7438                                       ->enable_multibyte_characters),
7439                                Qnil);
7440 }
7441
7442 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7443        1, 2, 0,
7444        doc: /* Detect coding system of the text in STRING.
7445 Return a list of possible coding systems ordered by priority.
7446
7447 If only ASCII characters are found, it returns a list of single element
7448 `undecided' or its subsidiary coding system according to a detected
7449 end-of-line format.
7450
7451 If optional argument HIGHEST is non-nil, return the coding system of
7452 highest priority.  */)
7453      (string, highest)
7454      Lisp_Object string, highest;
7455 {
7456   CHECK_STRING (string);
7457
7458   return detect_coding_system (SDATA (string),
7459                                SCHARS (string), SBYTES (string),
7460                                !NILP (highest), STRING_MULTIBYTE (string),
7461                                Qnil);
7462 }
7463
7464
7465 static INLINE int
7466 char_encodable_p (c, attrs)
7467      int c;
7468      Lisp_Object attrs;
7469 {
7470   Lisp_Object tail;
7471   struct charset *charset;
7472   Lisp_Object translation_table;
7473
7474   translation_table = CODING_ATTR_TRANS_TBL (attrs);
7475   if (! NILP (translation_table))
7476     c = translate_char (translation_table, c);
7477   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7478        CONSP (tail); tail = XCDR (tail))
7479     {
7480       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7481       if (CHAR_CHARSET_P (c, charset))
7482         break;
7483     }
7484   return (! NILP (tail));
7485 }
7486
7487
7488 /* Return a list of coding systems that safely encode the text between
7489    START and END.  If EXCLUDE is non-nil, it is a list of coding
7490    systems not to check.  The returned list doesn't contain any such
7491    coding systems.  In any case, if the text contains only ASCII or is
7492    unibyte, return t.  */
7493
7494 DEFUN ("find-coding-systems-region-internal",
7495        Ffind_coding_systems_region_internal,
7496        Sfind_coding_systems_region_internal, 2, 3, 0,
7497        doc: /* Internal use only.  */)
7498      (start, end, exclude)
7499      Lisp_Object start, end, exclude;
7500 {
7501   Lisp_Object coding_attrs_list, safe_codings;
7502   EMACS_INT start_byte, end_byte;
7503   const unsigned char *p, *pbeg, *pend;
7504   int c;
7505   Lisp_Object tail, elt;
7506
7507   if (STRINGP (start))
7508     {
7509       if (!STRING_MULTIBYTE (start)
7510           || SCHARS (start) == SBYTES (start))
7511         return Qt;
7512       start_byte = 0;
7513       end_byte = SBYTES (start);
7514     }
7515   else
7516     {
7517       CHECK_NUMBER_COERCE_MARKER (start);
7518       CHECK_NUMBER_COERCE_MARKER (end);
7519       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7520         args_out_of_range (start, end);
7521       if (NILP (current_buffer->enable_multibyte_characters))
7522         return Qt;
7523       start_byte = CHAR_TO_BYTE (XINT (start));
7524       end_byte = CHAR_TO_BYTE (XINT (end));
7525       if (XINT (end) - XINT (start) == end_byte - start_byte)
7526         return Qt;
7527
7528       if (XINT (start) < GPT && XINT (end) > GPT)
7529         {
7530           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7531             move_gap_both (XINT (start), start_byte);
7532           else
7533             move_gap_both (XINT (end), end_byte);
7534         }
7535     }
7536
7537   coding_attrs_list = Qnil;
7538   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7539     if (NILP (exclude)
7540         || NILP (Fmemq (XCAR (tail), exclude)))
7541       {
7542         Lisp_Object attrs;
7543
7544         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7545         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7546             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7547           {
7548             ASET (attrs, coding_attr_trans_tbl,
7549                   get_translation_table (attrs, 1, NULL));
7550             coding_attrs_list = Fcons (attrs, coding_attrs_list);
7551           }
7552       }
7553
7554   if (STRINGP (start))
7555     p = pbeg = SDATA (start);
7556   else
7557     p = pbeg = BYTE_POS_ADDR (start_byte);
7558   pend = p + (end_byte - start_byte);
7559
7560   while (p < pend && ASCII_BYTE_P (*p)) p++;
7561   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7562
7563   while (p < pend)
7564     {
7565       if (ASCII_BYTE_P (*p))
7566         p++;
7567       else
7568         {
7569           c = STRING_CHAR_ADVANCE (p);
7570
7571           charset_map_loaded = 0;
7572           for (tail = coding_attrs_list; CONSP (tail);)
7573             {
7574               elt = XCAR (tail);
7575               if (NILP (elt))
7576                 tail = XCDR (tail);
7577               else if (char_encodable_p (c, elt))
7578                 tail = XCDR (tail);
7579               else if (CONSP (XCDR (tail)))
7580                 {
7581                   XSETCAR (tail, XCAR (XCDR (tail)));
7582                   XSETCDR (tail, XCDR (XCDR (tail)));
7583                 }
7584               else
7585                 {
7586                   XSETCAR (tail, Qnil);
7587                   tail = XCDR (tail);
7588                 }
7589             }
7590           if (charset_map_loaded)
7591             {
7592               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7593
7594               if (STRINGP (start))
7595                 pbeg = SDATA (start);
7596               else
7597                 pbeg = BYTE_POS_ADDR (start_byte);
7598               p = pbeg + p_offset;
7599               pend = pbeg + pend_offset;
7600             }
7601         }
7602     }
7603
7604   safe_codings = list2 (Qraw_text, Qno_conversion);
7605   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7606     if (! NILP (XCAR (tail)))
7607       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7608
7609   return safe_codings;
7610 }
7611
7612
7613 DEFUN ("unencodable-char-position", Funencodable_char_position,
7614        Sunencodable_char_position, 3, 5, 0,
7615        doc: /*
7616 Return position of first un-encodable character in a region.
7617 START and END specfiy the region and CODING-SYSTEM specifies the
7618 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7619
7620 If optional 4th argument COUNT is non-nil, it specifies at most how
7621 many un-encodable characters to search.  In this case, the value is a
7622 list of positions.
7623
7624 If optional 5th argument STRING is non-nil, it is a string to search
7625 for un-encodable characters.  In that case, START and END are indexes
7626 to the string.  */)
7627      (start, end, coding_system, count, string)
7628      Lisp_Object start, end, coding_system, count, string;
7629 {
7630   int n;
7631   struct coding_system coding;
7632   Lisp_Object attrs, charset_list, translation_table;
7633   Lisp_Object positions;
7634   int from, to;
7635   const unsigned char *p, *stop, *pend;
7636   int ascii_compatible;
7637
7638   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7639   attrs = CODING_ID_ATTRS (coding.id);
7640   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7641     return Qnil;
7642   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7643   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7644   translation_table = get_translation_table (attrs, 1, NULL);
7645
7646   if (NILP (string))
7647     {
7648       validate_region (&start, &end);
7649       from = XINT (start);
7650       to = XINT (end);
7651       if (NILP (current_buffer->enable_multibyte_characters)
7652           || (ascii_compatible
7653               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7654         return Qnil;
7655       p = CHAR_POS_ADDR (from);
7656       pend = CHAR_POS_ADDR (to);
7657       if (from < GPT && to >= GPT)
7658         stop = GPT_ADDR;
7659       else
7660         stop = pend;
7661     }
7662   else
7663     {
7664       CHECK_STRING (string);
7665       CHECK_NATNUM (start);
7666       CHECK_NATNUM (end);
7667       from = XINT (start);
7668       to = XINT (end);
7669       if (from > to
7670           || to > SCHARS (string))
7671         args_out_of_range_3 (string, start, end);
7672       if (! STRING_MULTIBYTE (string))
7673         return Qnil;
7674       p = SDATA (string) + string_char_to_byte (string, from);
7675       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7676       if (ascii_compatible && (to - from) == (pend - p))
7677         return Qnil;
7678     }
7679
7680   if (NILP (count))
7681     n = 1;
7682   else
7683     {
7684       CHECK_NATNUM (count);
7685       n = XINT (count);
7686     }
7687
7688   positions = Qnil;
7689   while (1)
7690     {
7691       int c;
7692
7693       if (ascii_compatible)
7694         while (p < stop && ASCII_BYTE_P (*p))
7695           p++, from++;
7696       if (p >= stop)
7697         {
7698           if (p >= pend)
7699             break;
7700           stop = pend;
7701           p = GAP_END_ADDR;
7702         }
7703
7704       c = STRING_CHAR_ADVANCE (p);
7705       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7706           && ! char_charset (translate_char (translation_table, c),
7707                              charset_list, NULL))
7708         {
7709           positions = Fcons (make_number (from), positions);
7710           n--;
7711           if (n == 0)
7712             break;
7713         }
7714
7715       from++;
7716     }
7717
7718   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7719 }
7720
7721
7722 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7723        Scheck_coding_systems_region, 3, 3, 0,
7724        doc: /* Check if the region is encodable by coding systems.
7725
7726 START and END are buffer positions specifying the region.
7727 CODING-SYSTEM-LIST is a list of coding systems to check.
7728
7729 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7730 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7731 whole region, POS0, POS1, ... are buffer positions where non-encodable
7732 characters are found.
7733
7734 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7735 value is nil.
7736
7737 START may be a string.  In that case, check if the string is
7738 encodable, and the value contains indices to the string instead of
7739 buffer positions.  END is ignored.  */)
7740      (start, end, coding_system_list)
7741      Lisp_Object start, end, coding_system_list;
7742 {
7743   Lisp_Object list;
7744   EMACS_INT start_byte, end_byte;
7745   int pos;
7746   const unsigned char *p, *pbeg, *pend;
7747   int c;
7748   Lisp_Object tail, elt, attrs;
7749
7750   if (STRINGP (start))
7751     {
7752       if (!STRING_MULTIBYTE (start)
7753           && SCHARS (start) != SBYTES (start))
7754         return Qnil;
7755       start_byte = 0;
7756       end_byte = SBYTES (start);
7757       pos = 0;
7758     }
7759   else
7760     {
7761       CHECK_NUMBER_COERCE_MARKER (start);
7762       CHECK_NUMBER_COERCE_MARKER (end);
7763       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7764         args_out_of_range (start, end);
7765       if (NILP (current_buffer->enable_multibyte_characters))
7766         return Qnil;
7767       start_byte = CHAR_TO_BYTE (XINT (start));
7768       end_byte = CHAR_TO_BYTE (XINT (end));
7769       if (XINT (end) - XINT (start) == end_byte - start_byte)
7770         return Qt;
7771
7772       if (XINT (start) < GPT && XINT (end) > GPT)
7773         {
7774           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7775             move_gap_both (XINT (start), start_byte);
7776           else
7777             move_gap_both (XINT (end), end_byte);
7778         }
7779       pos = XINT (start);
7780     }
7781
7782   list = Qnil;
7783   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7784     {
7785       elt = XCAR (tail);
7786       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7787       ASET (attrs, coding_attr_trans_tbl,
7788             get_translation_table (attrs, 1, NULL));
7789       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7790     }
7791
7792   if (STRINGP (start))
7793     p = pbeg = SDATA (start);
7794   else
7795     p = pbeg = BYTE_POS_ADDR (start_byte);
7796   pend = p + (end_byte - start_byte);
7797
7798   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7799   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7800
7801   while (p < pend)
7802     {
7803       if (ASCII_BYTE_P (*p))
7804         p++;
7805       else
7806         {
7807           c = STRING_CHAR_ADVANCE (p);
7808
7809           charset_map_loaded = 0;
7810           for (tail = list; CONSP (tail); tail = XCDR (tail))
7811             {
7812               elt = XCDR (XCAR (tail));
7813               if (! char_encodable_p (c, XCAR (elt)))
7814                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7815             }
7816           if (charset_map_loaded)
7817             {
7818               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7819
7820               if (STRINGP (start))
7821                 pbeg = SDATA (start);
7822               else
7823                 pbeg = BYTE_POS_ADDR (start_byte);
7824               p = pbeg + p_offset;
7825               pend = pbeg + pend_offset;
7826             }
7827         }
7828       pos++;
7829     }
7830
7831   tail = list;
7832   list = Qnil;
7833   for (; CONSP (tail); tail = XCDR (tail))
7834     {
7835       elt = XCAR (tail);
7836       if (CONSP (XCDR (XCDR (elt))))
7837         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7838                       list);
7839     }
7840
7841   return list;
7842 }
7843
7844
7845 Lisp_Object
7846 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7847      Lisp_Object start, end, coding_system, dst_object;
7848      int encodep, norecord;
7849 {
7850   struct coding_system coding;
7851   EMACS_INT from, from_byte, to, to_byte;
7852   Lisp_Object src_object;
7853
7854   CHECK_NUMBER_COERCE_MARKER (start);
7855   CHECK_NUMBER_COERCE_MARKER (end);
7856   if (NILP (coding_system))
7857     coding_system = Qno_conversion;
7858   else
7859     CHECK_CODING_SYSTEM (coding_system);
7860   src_object = Fcurrent_buffer ();
7861   if (NILP (dst_object))
7862     dst_object = src_object;
7863   else if (! EQ (dst_object, Qt))
7864     CHECK_BUFFER (dst_object);
7865
7866   validate_region (&start, &end);
7867   from = XFASTINT (start);
7868   from_byte = CHAR_TO_BYTE (from);
7869   to = XFASTINT (end);
7870   to_byte = CHAR_TO_BYTE (to);
7871
7872   setup_coding_system (coding_system, &coding);
7873   coding.mode |= CODING_MODE_LAST_BLOCK;
7874
7875   if (encodep)
7876     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7877                           dst_object);
7878   else
7879     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7880                           dst_object);
7881   if (! norecord)
7882     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7883
7884   return (BUFFERP (dst_object)
7885           ? make_number (coding.produced_char)
7886           : coding.dst_object);
7887 }
7888
7889
DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
       3, 4, "r\nzCoding system: ",
       doc: /* Decode the current region from the specified coding system.
When called from a program, takes four arguments:
        START, END, CODING-SYSTEM, and DESTINATION.
START and END are buffer positions.

Optional 4th argument DESTINATION specifies where the decoded text goes.
If nil, the region between START and END is replaced by the decoded text.
If a buffer, the decoded text is inserted in the buffer.
If t, the decoded text is returned.

This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)
It returns the length of the decoded text.  */)
     (start, end, coding_system, destination)
     Lisp_Object start, end, coding_system, destination;
{
  /* The trailing arguments are ENCODEP = 0 (decode) and NORECORD = 0
     (do record the coding system used).  */
  return code_convert_region (start, end, coding_system, destination, 0, 0);
}
7911
DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
       3, 4, "r\nzCoding system: ",
       doc: /* Encode the current region by specified coding system.
When called from a program, takes four arguments:
        START, END, CODING-SYSTEM, and DESTINATION.
START and END are buffer positions.

Optional 4th argument DESTINATION specifies where the encoded text goes.
If nil, the region between START and END is replaced by the encoded text.
If a buffer, the encoded text is inserted in the buffer.
If t, the encoded text is returned.

This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)
It returns the length of the encoded text.  */)
  (start, end, coding_system, destination)
     Lisp_Object start, end, coding_system, destination;
{
  /* The trailing arguments are ENCODEP = 1 (encode) and NORECORD = 0
     (do record the coding system used).  */
  return code_convert_region (start, end, coding_system, destination, 1, 0);
}
7932
7933 Lisp_Object
7934 code_convert_string (string, coding_system, dst_object,
7935                      encodep, nocopy, norecord)
7936      Lisp_Object string, coding_system, dst_object;
7937      int encodep, nocopy, norecord;
7938 {
7939   struct coding_system coding;
7940   EMACS_INT chars, bytes;
7941
7942   CHECK_STRING (string);
7943   if (NILP (coding_system))
7944     {
7945       if (! norecord)
7946         Vlast_coding_system_used = Qno_conversion;
7947       if (NILP (dst_object))
7948         return (nocopy ? Fcopy_sequence (string) : string);
7949     }
7950
7951   if (NILP (coding_system))
7952     coding_system = Qno_conversion;
7953   else
7954     CHECK_CODING_SYSTEM (coding_system);
7955   if (NILP (dst_object))
7956     dst_object = Qt;
7957   else if (! EQ (dst_object, Qt))
7958     CHECK_BUFFER (dst_object);
7959
7960   setup_coding_system (coding_system, &coding);
7961   coding.mode |= CODING_MODE_LAST_BLOCK;
7962   chars = SCHARS (string);
7963   bytes = SBYTES (string);
7964   if (encodep)
7965     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7966   else
7967     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7968   if (! norecord)
7969     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7970
7971   return (BUFFERP (dst_object)
7972           ? make_number (coding.produced_char)
7973           : coding.dst_object);
7974 }
7975
7976
/* Encode or decode STRING according to CODING_SYSTEM.
   Do not set Vlast_coding_system_used.

   This function is called only from macros DECODE_FILE and
   ENCODE_FILE, thus we ignore character composition.

   ENCODEP is handed to code_convert_string unchanged; the call also
   passes Qt as DST_OBJECT, 0 as NOCOPY and 1 as NORECORD.  */

Lisp_Object
code_convert_string_norecord (string, coding_system, encodep)
     Lisp_Object string, coding_system;
     int encodep;
{
  return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
}
7990
7991
DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
       2, 4, 0,
       doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.

Optional third arg NOCOPY non-nil means it is OK to return STRING itself
if the decoding operation is trivial.

Optional fourth arg BUFFER non-nil means that the decoded text is
inserted in BUFFER instead of returned as a string.  In this case,
the return value is the length of the decoded text.

This function sets `last-coding-system-used' to the precise coding system
used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
not fully specified.)  */)
  (string, coding_system, nocopy, buffer)
     Lisp_Object string, coding_system, nocopy, buffer;
{
  /* ENCODEP = 0 (decode); NORECORD = 0 (do record the coding system
     used).  */
  return code_convert_string (string, coding_system, buffer,
                              0, ! NILP (nocopy), 0);
}
8012
8013 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
8014        2, 4, 0,
8015        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
8016
8017 Optional third arg NOCOPY non-nil means it is OK to return STRING
8018 itself if the encoding operation is trivial.
8019
8020 Optional fourth arg BUFFER non-nil meant that the encoded text is
8021 inserted in BUFFER instead of returned as a string.  In this case,
8022 the return value is BUFFER.
8023
8024 This function sets `last-coding-system-used' to the precise coding system
8025 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
8026 not fully specified.)  */)
8027      (string, coding_system, nocopy, buffer)
8028      Lisp_Object string, coding_system, nocopy, buffer;
8029 {
8030   return code_convert_string (string, coding_system, buffer,
8031                               1, ! NILP (nocopy), 1);
8032 }
8033
8034 \f
8035 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
8036        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
8037 Return the corresponding character.  */)
8038      (code)
8039      Lisp_Object code;
8040 {
8041   Lisp_Object spec, attrs, val;
8042   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
8043   int c;
8044
8045   CHECK_NATNUM (code);
8046   c = XFASTINT (code);
8047   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8048   attrs = AREF (spec, 0);
8049
8050   if (ASCII_BYTE_P (c)
8051       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8052     return code;
8053
8054   val = CODING_ATTR_CHARSET_LIST (attrs);
8055   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8056   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8057   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
8058
8059   if (c <= 0x7F)
8060     charset = charset_roman;
8061   else if (c >= 0xA0 && c < 0xDF)
8062     {
8063       charset = charset_kana;
8064       c -= 0x80;
8065     }
8066   else
8067     {
8068       int s1 = c >> 8, s2 = c & 0xFF;
8069
8070       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8071           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
8072         error ("Invalid code: %d", code);
8073       SJIS_TO_JIS (c);
8074       charset = charset_kanji;
8075     }
8076   c = DECODE_CHAR (charset, c);
8077   if (c < 0)
8078     error ("Invalid code: %d", code);
8079   return make_number (c);
8080 }
8081
8082
8083 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8084        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
8085 Return the corresponding code in SJIS.  */)
8086      (ch)
8087     Lisp_Object ch;
8088 {
8089   Lisp_Object spec, attrs, charset_list;
8090   int c;
8091   struct charset *charset;
8092   unsigned code;
8093
8094   CHECK_CHARACTER (ch);
8095   c = XFASTINT (ch);
8096   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8097   attrs = AREF (spec, 0);
8098
8099   if (ASCII_CHAR_P (c)
8100       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8101     return ch;
8102
8103   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8104   charset = char_charset (c, charset_list, &code);
8105   if (code == CHARSET_INVALID_CODE (charset))
8106     error ("Can't encode by shift_jis encoding: %d", c);
8107   JIS_TO_SJIS (code);
8108
8109   return make_number (code);
8110 }
8111
8112 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8113        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8114 Return the corresponding character.  */)
8115      (code)
8116      Lisp_Object code;
8117 {
8118   Lisp_Object spec, attrs, val;
8119   struct charset *charset_roman, *charset_big5, *charset;
8120   int c;
8121
8122   CHECK_NATNUM (code);
8123   c = XFASTINT (code);
8124   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8125   attrs = AREF (spec, 0);
8126
8127   if (ASCII_BYTE_P (c)
8128       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8129     return code;
8130
8131   val = CODING_ATTR_CHARSET_LIST (attrs);
8132   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8133   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8134
8135   if (c <= 0x7F)
8136     charset = charset_roman;
8137   else
8138     {
8139       int b1 = c >> 8, b2 = c & 0x7F;
8140       if (b1 < 0xA1 || b1 > 0xFE
8141           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8142         error ("Invalid code: %d", code);
8143       charset = charset_big5;
8144     }
8145   c = DECODE_CHAR (charset, (unsigned )c);
8146   if (c < 0)
8147     error ("Invalid code: %d", code);
8148   return make_number (c);
8149 }
8150
8151 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8152        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
8153 Return the corresponding character code in Big5.  */)
8154      (ch)
8155      Lisp_Object ch;
8156 {
8157   Lisp_Object spec, attrs, charset_list;
8158   struct charset *charset;
8159   int c;
8160   unsigned code;
8161
8162   CHECK_CHARACTER (ch);
8163   c = XFASTINT (ch);
8164   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8165   attrs = AREF (spec, 0);
8166   if (ASCII_CHAR_P (c)
8167       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8168     return ch;
8169
8170   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8171   charset = char_charset (c, charset_list, &code);
8172   if (code == CHARSET_INVALID_CODE (charset))
8173     error ("Can't encode by Big5 encoding: %d", c);
8174
8175   return make_number (code);
8176 }
8177
8178 \f
8179 DEFUN ("set-terminal-coding-system-internal",
8180        Fset_terminal_coding_system_internal,
8181        Sset_terminal_coding_system_internal, 1, 1, 0,
8182        doc: /* Internal use only.  */)
8183      (coding_system)
8184      Lisp_Object coding_system;
8185 {
8186   CHECK_SYMBOL (coding_system);
8187   setup_coding_system (Fcheck_coding_system (coding_system),
8188                         &terminal_coding);
8189
8190   /* We had better not send unsafe characters to terminal.  */
8191   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
8192   /* Characer composition should be disabled.  */
8193   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8194   terminal_coding.src_multibyte = 1;
8195   terminal_coding.dst_multibyte = 0;
8196   return Qnil;
8197 }
8198
8199 DEFUN ("set-safe-terminal-coding-system-internal",
8200        Fset_safe_terminal_coding_system_internal,
8201        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8202        doc: /* Internal use only.  */)
8203      (coding_system)
8204      Lisp_Object coding_system;
8205 {
8206   CHECK_SYMBOL (coding_system);
8207   setup_coding_system (Fcheck_coding_system (coding_system),
8208                        &safe_terminal_coding);
8209   /* Characer composition should be disabled.  */
8210   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8211   safe_terminal_coding.src_multibyte = 1;
8212   safe_terminal_coding.dst_multibyte = 0;
8213   return Qnil;
8214 }
8215
8216 DEFUN ("terminal-coding-system",
8217        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
8218        doc: /* Return coding system specified for terminal output.  */)
8219      ()
8220 {
8221   Lisp_Object coding_system;
8222
8223   coding_system = CODING_ID_NAME (terminal_coding.id);
8224   /* For backward compatibility, return nil if it is `undecided'. */
8225   return (coding_system != Qundecided ? coding_system : Qnil);
8226 }
8227
8228 DEFUN ("set-keyboard-coding-system-internal",
8229        Fset_keyboard_coding_system_internal,
8230        Sset_keyboard_coding_system_internal, 1, 1, 0,
8231        doc: /* Internal use only.  */)
8232      (coding_system)
8233      Lisp_Object coding_system;
8234 {
8235   CHECK_SYMBOL (coding_system);
8236   setup_coding_system (Fcheck_coding_system (coding_system),
8237                        &keyboard_coding);
8238   /* Characer composition should be disabled.  */
8239   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8240   return Qnil;
8241 }
8242
8243 DEFUN ("keyboard-coding-system",
8244        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8245        doc: /* Return coding system specified for decoding keyboard input.  */)
8246      ()
8247 {
8248   return CODING_ID_NAME (keyboard_coding.id);
8249 }
8250
8251 \f
8252 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8253        Sfind_operation_coding_system,  1, MANY, 0,
8254        doc: /* Choose a coding system for an operation based on the target name.
8255 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8256 DECODING-SYSTEM is the coding system to use for decoding
8257 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8258 for encoding (in case OPERATION does encoding).
8259
8260 The first argument OPERATION specifies an I/O primitive:
8261   For file I/O, `insert-file-contents' or `write-region'.
8262   For process I/O, `call-process', `call-process-region', or `start-process'.
8263   For network I/O, `open-network-stream'.
8264
8265 The remaining arguments should be the same arguments that were passed
8266 to the primitive.  Depending on which primitive, one of those arguments
8267 is selected as the TARGET.  For example, if OPERATION does file I/O,
8268 whichever argument specifies the file name is TARGET.
8269
8270 TARGET has a meaning which depends on OPERATION:
8271   For file I/O, TARGET is a file name.
8272   For process I/O, TARGET is a process name.
8273   For network I/O, TARGET is a service name or a port number
8274
8275 This function looks up what specified for TARGET in,
8276 `file-coding-system-alist', `process-coding-system-alist',
8277 or `network-coding-system-alist' depending on OPERATION.
8278 They may specify a coding system, a cons of coding systems,
8279 or a function symbol to call.
8280 In the last case, we call the function with one argument,
8281 which is a list of all the arguments given to this function.
8282
8283 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
8284      (nargs, args)
8285      int nargs;
8286      Lisp_Object *args;
8287 {
8288   Lisp_Object operation, target_idx, target, val;
8289   register Lisp_Object chain;
8290
8291   if (nargs < 2)
8292     error ("Too few arguments");
8293   operation = args[0];
8294   if (!SYMBOLP (operation)
8295       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8296     error ("Invalid first arguement");
8297   if (nargs < 1 + XINT (target_idx))
8298     error ("Too few arguments for operation: %s",
8299            SDATA (SYMBOL_NAME (operation)));
8300   target = args[XINT (target_idx) + 1];
8301   if (!(STRINGP (target)
8302         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8303     error ("Invalid %dth argument", XINT (target_idx) + 1);
8304
8305   chain = ((EQ (operation, Qinsert_file_contents)
8306             || EQ (operation, Qwrite_region))
8307            ? Vfile_coding_system_alist
8308            : (EQ (operation, Qopen_network_stream)
8309               ? Vnetwork_coding_system_alist
8310               : Vprocess_coding_system_alist));
8311   if (NILP (chain))
8312     return Qnil;
8313
8314   for (; CONSP (chain); chain = XCDR (chain))
8315     {
8316       Lisp_Object elt;
8317
8318       elt = XCAR (chain);
8319       if (CONSP (elt)
8320           && ((STRINGP (target)
8321                && STRINGP (XCAR (elt))
8322                && fast_string_match (XCAR (elt), target) >= 0)
8323               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8324         {
8325           val = XCDR (elt);
8326           /* Here, if VAL is both a valid coding system and a valid
8327              function symbol, we return VAL as a coding system.  */
8328           if (CONSP (val))
8329             return val;
8330           if (! SYMBOLP (val))
8331             return Qnil;
8332           if (! NILP (Fcoding_system_p (val)))
8333             return Fcons (val, val);
8334           if (! NILP (Ffboundp (val)))
8335             {
8336               val = call1 (val, Flist (nargs, args));
8337               if (CONSP (val))
8338                 return val;
8339               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8340                 return Fcons (val, val);
8341             }
8342           return Qnil;
8343         }
8344     }
8345   return Qnil;
8346 }
8347
8348 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8349        Sset_coding_system_priority, 0, MANY, 0,
8350        doc: /* Assign higher priority to the coding systems given as arguments.
8351 If multiple coding systems belongs to the same category,
8352 all but the first one are ignored.
8353
8354 usage: (set-coding-system-priority ...)  */)
8355      (nargs, args)
8356      int nargs;
8357      Lisp_Object *args;
8358 {
8359   int i, j;
8360   int changed[coding_category_max];
8361   enum coding_category priorities[coding_category_max];
8362
8363   bzero (changed, sizeof changed);
8364
8365   for (i = j = 0; i < nargs; i++)
8366     {
8367       enum coding_category category;
8368       Lisp_Object spec, attrs;
8369
8370       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8371       attrs = AREF (spec, 0);
8372       category = XINT (CODING_ATTR_CATEGORY (attrs));
8373       if (changed[category])
8374         /* Ignore this coding system because a coding system of the
8375            same category already had a higher priority.  */
8376         continue;
8377       changed[category] = 1;
8378       priorities[j++] = category;
8379       if (coding_categories[category].id >= 0
8380           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8381         setup_coding_system (args[i], &coding_categories[category]);
8382       Fset (AREF (Vcoding_category_table, category), args[i]);
8383     }
8384
8385   /* Now we have decided top J priorities.  Reflect the order of the
8386      original priorities to the remaining priorities.  */
8387
8388   for (i = j, j = 0; i < coding_category_max; i++, j++)
8389     {
8390       while (j < coding_category_max
8391              && changed[coding_priorities[j]])
8392         j++;
8393       if (j == coding_category_max)
8394         abort ();
8395       priorities[i] = coding_priorities[j];
8396     }
8397
8398   bcopy (priorities, coding_priorities, sizeof priorities);
8399
8400   /* Update `coding-category-list'.  */
8401   Vcoding_category_list = Qnil;
8402   for (i = coding_category_max - 1; i >= 0; i--)
8403     Vcoding_category_list
8404       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8405                Vcoding_category_list);
8406
8407   return Qnil;
8408 }
8409
8410 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8411        Scoding_system_priority_list, 0, 1, 0,
8412        doc: /* Return a list of coding systems ordered by their priorities.
8413 HIGHESTP non-nil means just return the highest priority one.  */)
8414      (highestp)
8415      Lisp_Object highestp;
8416 {
8417   int i;
8418   Lisp_Object val;
8419
8420   for (i = 0, val = Qnil; i < coding_category_max; i++)
8421     {
8422       enum coding_category category = coding_priorities[i];
8423       int id = coding_categories[category].id;
8424       Lisp_Object attrs;
8425
8426       if (id < 0)
8427         continue;
8428       attrs = CODING_ID_ATTRS (id);
8429       if (! NILP (highestp))
8430         return CODING_ATTR_BASE_NAME (attrs);
8431       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8432     }
8433   return Fnreverse (val);
8434 }
8435
8436 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8437
8438 static Lisp_Object
8439 make_subsidiaries (base)
8440      Lisp_Object base;
8441 {
8442   Lisp_Object subsidiaries;
8443   int base_name_len = SBYTES (SYMBOL_NAME (base));
8444   char *buf = (char *) alloca (base_name_len + 6);
8445   int i;
8446
8447   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8448   subsidiaries = Fmake_vector (make_number (3), Qnil);
8449   for (i = 0; i < 3; i++)
8450     {
8451       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8452       ASET (subsidiaries, i, intern (buf));
8453     }
8454   return subsidiaries;
8455 }
8456
8457
8458 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8459        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8460        doc: /* For internal use only.
8461 usage: (define-coding-system-internal ...)  */)
8462      (nargs, args)
8463      int nargs;
8464      Lisp_Object *args;
8465 {
8466   Lisp_Object name;
8467   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
8468   Lisp_Object attrs;            /* Vector of attributes.  */
8469   Lisp_Object eol_type;
8470   Lisp_Object aliases;
8471   Lisp_Object coding_type, charset_list, safe_charsets;
8472   enum coding_category category;
8473   Lisp_Object tail, val;
8474   int max_charset_id = 0;
8475   int i;
8476
8477   if (nargs < coding_arg_max)
8478     goto short_args;
8479
8480   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8481
8482   name = args[coding_arg_name];
8483   CHECK_SYMBOL (name);
8484   CODING_ATTR_BASE_NAME (attrs) = name;
8485
8486   val = args[coding_arg_mnemonic];
8487   if (! STRINGP (val))
8488     CHECK_CHARACTER (val);
8489   CODING_ATTR_MNEMONIC (attrs) = val;
8490
8491   coding_type = args[coding_arg_coding_type];
8492   CHECK_SYMBOL (coding_type);
8493   CODING_ATTR_TYPE (attrs) = coding_type;
8494
8495   charset_list = args[coding_arg_charset_list];
8496   if (SYMBOLP (charset_list))
8497     {
8498       if (EQ (charset_list, Qiso_2022))
8499         {
8500           if (! EQ (coding_type, Qiso_2022))
8501             error ("Invalid charset-list");
8502           charset_list = Viso_2022_charset_list;
8503         }
8504       else if (EQ (charset_list, Qemacs_mule))
8505         {
8506           if (! EQ (coding_type, Qemacs_mule))
8507             error ("Invalid charset-list");
8508           charset_list = Vemacs_mule_charset_list;
8509         }
8510       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8511         if (max_charset_id < XFASTINT (XCAR (tail)))
8512           max_charset_id = XFASTINT (XCAR (tail));
8513     }
8514   else
8515     {
8516       charset_list = Fcopy_sequence (charset_list);
8517       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8518         {
8519           struct charset *charset;
8520
8521           val = Fcar (tail);
8522           CHECK_CHARSET_GET_CHARSET (val, charset);
8523           if (EQ (coding_type, Qiso_2022)
8524               ? CHARSET_ISO_FINAL (charset) < 0
8525               : EQ (coding_type, Qemacs_mule)
8526               ? CHARSET_EMACS_MULE_ID (charset) < 0
8527               : 0)
8528             error ("Can't handle charset `%s'",
8529                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8530
8531           XSETCAR (tail, make_number (charset->id));
8532           if (max_charset_id < charset->id)
8533             max_charset_id = charset->id;
8534         }
8535     }
8536   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8537
8538   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8539                                 make_number (255));
8540   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8541     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8542   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8543
8544   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8545
8546   val = args[coding_arg_decode_translation_table];
8547   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8548     CHECK_SYMBOL (val);
8549   CODING_ATTR_DECODE_TBL (attrs) = val;
8550
8551   val = args[coding_arg_encode_translation_table];
8552   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8553     CHECK_SYMBOL (val);
8554   CODING_ATTR_ENCODE_TBL (attrs) = val;
8555
8556   val = args[coding_arg_post_read_conversion];
8557   CHECK_SYMBOL (val);
8558   CODING_ATTR_POST_READ (attrs) = val;
8559
8560   val = args[coding_arg_pre_write_conversion];
8561   CHECK_SYMBOL (val);
8562   CODING_ATTR_PRE_WRITE (attrs) = val;
8563
8564   val = args[coding_arg_default_char];
8565   if (NILP (val))
8566     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8567   else
8568     {
8569       CHECK_CHARACTER (val);
8570       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8571     }
8572
8573   val = args[coding_arg_for_unibyte];
8574   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8575
8576   val = args[coding_arg_plist];
8577   CHECK_LIST (val);
8578   CODING_ATTR_PLIST (attrs) = val;
8579
8580   if (EQ (coding_type, Qcharset))
8581     {
8582       /* Generate a lisp vector of 256 elements.  Each element is nil,
8583          integer, or a list of charset IDs.
8584
8585          If Nth element is nil, the byte code N is invalid in this
8586          coding system.
8587
8588          If Nth element is a number NUM, N is the first byte of a
8589          charset whose ID is NUM.
8590
8591          If Nth element is a list of charset IDs, N is the first byte
8592          of one of them.  The list is sorted by dimensions of the
8593          charsets.  A charset of smaller dimension comes firtst. */
8594       val = Fmake_vector (make_number (256), Qnil);
8595
8596       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8597         {
8598           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8599           int dim = CHARSET_DIMENSION (charset);
8600           int idx = (dim - 1) * 4;
8601
8602           if (CHARSET_ASCII_COMPATIBLE_P (charset))
8603             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8604
8605           for (i = charset->code_space[idx];
8606                i <= charset->code_space[idx + 1]; i++)
8607             {
8608               Lisp_Object tmp, tmp2;
8609               int dim2;
8610
8611               tmp = AREF (val, i);
8612               if (NILP (tmp))
8613                 tmp = XCAR (tail);
8614               else if (NUMBERP (tmp))
8615                 {
8616                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8617                   if (dim < dim2)
8618                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8619                   else
8620                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8621                 }
8622               else
8623                 {
8624                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8625                     {
8626                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8627                       if (dim < dim2)
8628                         break;
8629                     }
8630                   if (NILP (tmp2))
8631                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8632                   else
8633                     {
8634                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8635                       XSETCAR (tmp2, XCAR (tail));
8636                     }
8637                 }
8638               ASET (val, i, tmp);
8639             }
8640         }
8641       ASET (attrs, coding_attr_charset_valids, val);
8642       category = coding_category_charset;
8643     }
8644   else if (EQ (coding_type, Qccl))
8645     {
8646       Lisp_Object valids;
8647
8648       if (nargs < coding_arg_ccl_max)
8649         goto short_args;
8650
8651       val = args[coding_arg_ccl_decoder];
8652       CHECK_CCL_PROGRAM (val);
8653       if (VECTORP (val))
8654         val = Fcopy_sequence (val);
8655       ASET (attrs, coding_attr_ccl_decoder, val);
8656
8657       val = args[coding_arg_ccl_encoder];
8658       CHECK_CCL_PROGRAM (val);
8659       if (VECTORP (val))
8660         val = Fcopy_sequence (val);
8661       ASET (attrs, coding_attr_ccl_encoder, val);
8662
8663       val = args[coding_arg_ccl_valids];
8664       valids = Fmake_string (make_number (256), make_number (0));
8665       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8666         {
8667           int from, to;
8668
8669           val = Fcar (tail);
8670           if (INTEGERP (val))
8671             {
8672               from = to = XINT (val);
8673               if (from < 0 || from > 255)
8674                 args_out_of_range_3 (val, make_number (0), make_number (255));
8675             }
8676           else
8677             {
8678               CHECK_CONS (val);
8679               CHECK_NATNUM_CAR (val);
8680               CHECK_NATNUM_CDR (val);
8681               from = XINT (XCAR (val));
8682               if (from > 255)
8683                 args_out_of_range_3 (XCAR (val),
8684                                      make_number (0), make_number (255));
8685               to = XINT (XCDR (val));
8686               if (to < from || to > 255)
8687                 args_out_of_range_3 (XCDR (val),
8688                                      XCAR (val), make_number (255));
8689             }
8690           for (i = from; i <= to; i++)
8691             SSET (valids, i, 1);
8692         }
8693       ASET (attrs, coding_attr_ccl_valids, valids);
8694
8695       category = coding_category_ccl;
8696     }
8697   else if (EQ (coding_type, Qutf_16))
8698     {
8699       Lisp_Object bom, endian;
8700
8701       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8702
8703       if (nargs < coding_arg_utf16_max)
8704         goto short_args;
8705
8706       bom = args[coding_arg_utf16_bom];
8707       if (! NILP (bom) && ! EQ (bom, Qt))
8708         {
8709           CHECK_CONS (bom);
8710           val = XCAR (bom);
8711           CHECK_CODING_SYSTEM (val);
8712           val = XCDR (bom);
8713           CHECK_CODING_SYSTEM (val);
8714         }
8715       ASET (attrs, coding_attr_utf_16_bom, bom);
8716
8717       endian = args[coding_arg_utf16_endian];
8718       CHECK_SYMBOL (endian);
8719       if (NILP (endian))
8720         endian = Qbig;
8721       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8722         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8723       ASET (attrs, coding_attr_utf_16_endian, endian);
8724
8725       category = (CONSP (bom)
8726                   ? coding_category_utf_16_auto
8727                   : NILP (bom)
8728                   ? (EQ (endian, Qbig)
8729                      ? coding_category_utf_16_be_nosig
8730                      : coding_category_utf_16_le_nosig)
8731                   : (EQ (endian, Qbig)
8732                      ? coding_category_utf_16_be
8733                      : coding_category_utf_16_le));
8734     }
8735   else if (EQ (coding_type, Qiso_2022))
8736     {
8737       Lisp_Object initial, reg_usage, request, flags;
8738       int i;
8739
8740       if (nargs < coding_arg_iso2022_max)
8741         goto short_args;
8742
8743       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8744       CHECK_VECTOR (initial);
8745       for (i = 0; i < 4; i++)
8746         {
8747           val = Faref (initial, make_number (i));
8748           if (! NILP (val))
8749             {
8750               struct charset *charset;
8751
8752               CHECK_CHARSET_GET_CHARSET (val, charset);
8753               ASET (initial, i, make_number (CHARSET_ID (charset)));
8754               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8755                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8756             }
8757           else
8758             ASET (initial, i, make_number (-1));
8759         }
8760
8761       reg_usage = args[coding_arg_iso2022_reg_usage];
8762       CHECK_CONS (reg_usage);
8763       CHECK_NUMBER_CAR (reg_usage);
8764       CHECK_NUMBER_CDR (reg_usage);
8765
8766       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8767       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8768         {
8769           int id;
8770           Lisp_Object tmp;
8771
8772           val = Fcar (tail);
8773           CHECK_CONS (val);
8774           tmp = XCAR (val);
8775           CHECK_CHARSET_GET_ID (tmp, id);
8776           CHECK_NATNUM_CDR (val);
8777           if (XINT (XCDR (val)) >= 4)
8778             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8779           XSETCAR (val, make_number (id));
8780         }
8781
8782       flags = args[coding_arg_iso2022_flags];
8783       CHECK_NATNUM (flags);
8784       i = XINT (flags);
8785       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8786         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8787
8788       ASET (attrs, coding_attr_iso_initial, initial);
8789       ASET (attrs, coding_attr_iso_usage, reg_usage);
8790       ASET (attrs, coding_attr_iso_request, request);
8791       ASET (attrs, coding_attr_iso_flags, flags);
8792       setup_iso_safe_charsets (attrs);
8793
8794       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8795         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8796                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8797                     ? coding_category_iso_7_else
8798                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8799                     ? coding_category_iso_7
8800                     : coding_category_iso_7_tight);
8801       else
8802         {
8803           int id = XINT (AREF (initial, 1));
8804
8805           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8806                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8807                        || id < 0)
8808                       ? coding_category_iso_8_else
8809                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8810                       ? coding_category_iso_8_1
8811                       : coding_category_iso_8_2);
8812         }
8813       if (category != coding_category_iso_8_1
8814           && category != coding_category_iso_8_2)
8815         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8816     }
8817   else if (EQ (coding_type, Qemacs_mule))
8818     {
8819       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8820         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8821       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8822       category = coding_category_emacs_mule;
8823     }
8824   else if (EQ (coding_type, Qshift_jis))
8825     {
8826
8827       struct charset *charset;
8828
8829       if (XINT (Flength (charset_list)) != 3
8830           && XINT (Flength (charset_list)) != 4)
8831         error ("There should be three or four charsets");
8832
8833       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8834       if (CHARSET_DIMENSION (charset) != 1)
8835         error ("Dimension of charset %s is not one",
8836                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8837       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8838         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8839
8840       charset_list = XCDR (charset_list);
8841       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8842       if (CHARSET_DIMENSION (charset) != 1)
8843         error ("Dimension of charset %s is not one",
8844                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8845
8846       charset_list = XCDR (charset_list);
8847       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8848       if (CHARSET_DIMENSION (charset) != 2)
8849         error ("Dimension of charset %s is not two",
8850                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8851
8852       charset_list = XCDR (charset_list);
8853       if (! NILP (charset_list))
8854         {
8855           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8856           if (CHARSET_DIMENSION (charset) != 2)
8857             error ("Dimension of charset %s is not two",
8858                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8859         }
8860
8861       category = coding_category_sjis;
8862       Vsjis_coding_system = name;
8863     }
8864   else if (EQ (coding_type, Qbig5))
8865     {
8866       struct charset *charset;
8867
8868       if (XINT (Flength (charset_list)) != 2)
8869         error ("There should be just two charsets");
8870
8871       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8872       if (CHARSET_DIMENSION (charset) != 1)
8873         error ("Dimension of charset %s is not one",
8874                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8875       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8876         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8877
8878       charset_list = XCDR (charset_list);
8879       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8880       if (CHARSET_DIMENSION (charset) != 2)
8881         error ("Dimension of charset %s is not two",
8882                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8883
8884       category = coding_category_big5;
8885       Vbig5_coding_system = name;
8886     }
8887   else if (EQ (coding_type, Qraw_text))
8888     {
8889       category = coding_category_raw_text;
8890       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8891     }
8892   else if (EQ (coding_type, Qutf_8))
8893     {
8894       category = coding_category_utf_8;
8895       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8896     }
8897   else if (EQ (coding_type, Qundecided))
8898     category = coding_category_undecided;
8899   else
8900     error ("Invalid coding system type: %s",
8901            SDATA (SYMBOL_NAME (coding_type)));
8902
8903   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8904   CODING_ATTR_PLIST (attrs)
8905     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8906                                 CODING_ATTR_PLIST (attrs)));
8907   CODING_ATTR_PLIST (attrs)
8908     = Fcons (QCascii_compatible_p,
8909              Fcons (CODING_ATTR_ASCII_COMPAT (attrs),
8910                     CODING_ATTR_PLIST (attrs)));
8911
8912   eol_type = args[coding_arg_eol_type];
8913   if (! NILP (eol_type)
8914       && ! EQ (eol_type, Qunix)
8915       && ! EQ (eol_type, Qdos)
8916       && ! EQ (eol_type, Qmac))
8917     error ("Invalid eol-type");
8918
8919   aliases = Fcons (name, Qnil);
8920
8921   if (NILP (eol_type))
8922     {
8923       eol_type = make_subsidiaries (name);
8924       for (i = 0; i < 3; i++)
8925         {
8926           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8927
8928           this_name = AREF (eol_type, i);
8929           this_aliases = Fcons (this_name, Qnil);
8930           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8931           this_spec = Fmake_vector (make_number (3), attrs);
8932           ASET (this_spec, 1, this_aliases);
8933           ASET (this_spec, 2, this_eol_type);
8934           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8935           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8936           Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8937                                         Vcoding_system_alist);
8938         }
8939     }
8940
8941   spec_vec = Fmake_vector (make_number (3), attrs);
8942   ASET (spec_vec, 1, aliases);
8943   ASET (spec_vec, 2, eol_type);
8944
8945   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8946   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8947   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8948                                 Vcoding_system_alist);
8949
8950   {
8951     int id = coding_categories[category].id;
8952
8953     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8954       setup_coding_system (name, &coding_categories[category]);
8955   }
8956
8957   return Qnil;
8958
8959  short_args:
8960   return Fsignal (Qwrong_number_of_arguments,
8961                   Fcons (intern ("define-coding-system-internal"),
8962                          make_number (nargs)));
8963 }
8964
8965
8966 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8967        3, 3, 0,
8968        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
8969   (coding_system, prop, val)
8970      Lisp_Object coding_system, prop, val;
8971 {
8972   Lisp_Object spec, attrs;
8973
8974   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8975   attrs = AREF (spec, 0);
8976   if (EQ (prop, QCmnemonic))
8977     {
8978       if (! STRINGP (val))
8979         CHECK_CHARACTER (val);
8980       CODING_ATTR_MNEMONIC (attrs) = val;
8981     }
8982   else if (EQ (prop, QCdefalut_char))
8983     {
8984       if (NILP (val))
8985         val = make_number (' ');
8986       else
8987         CHECK_CHARACTER (val);
8988       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8989     }
8990   else if (EQ (prop, QCdecode_translation_table))
8991     {
8992       if (! CHAR_TABLE_P (val) && ! CONSP (val))
8993         CHECK_SYMBOL (val);
8994       CODING_ATTR_DECODE_TBL (attrs) = val;
8995     }
8996   else if (EQ (prop, QCencode_translation_table))
8997     {
8998       if (! CHAR_TABLE_P (val) && ! CONSP (val))
8999         CHECK_SYMBOL (val);
9000       CODING_ATTR_ENCODE_TBL (attrs) = val;
9001     }
9002   else if (EQ (prop, QCpost_read_conversion))
9003     {
9004       CHECK_SYMBOL (val);
9005       CODING_ATTR_POST_READ (attrs) = val;
9006     }
9007   else if (EQ (prop, QCpre_write_conversion))
9008     {
9009       CHECK_SYMBOL (val);
9010       CODING_ATTR_PRE_WRITE (attrs) = val;
9011     }
9012   else if (EQ (prop, QCascii_compatible_p))
9013     {
9014       CODING_ATTR_ASCII_COMPAT (attrs) = val;
9015     }
9016
9017   CODING_ATTR_PLIST (attrs)
9018     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
9019   return val;
9020 }
9021
9022
9023 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
9024        Sdefine_coding_system_alias, 2, 2, 0,
9025        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
9026      (alias, coding_system)
9027      Lisp_Object alias, coding_system;
9028 {
9029   Lisp_Object spec, aliases, eol_type;
9030
9031   CHECK_SYMBOL (alias);
9032   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9033   aliases = AREF (spec, 1);
9034   /* ALISES should be a list of length more than zero, and the first
9035      element is a base coding system.  Append ALIAS at the tail of the
9036      list.  */
9037   while (!NILP (XCDR (aliases)))
9038     aliases = XCDR (aliases);
9039   XSETCDR (aliases, Fcons (alias, Qnil));
9040
9041   eol_type = AREF (spec, 2);
9042   if (VECTORP (eol_type))
9043     {
9044       Lisp_Object subsidiaries;
9045       int i;
9046
9047       subsidiaries = make_subsidiaries (alias);
9048       for (i = 0; i < 3; i++)
9049         Fdefine_coding_system_alias (AREF (subsidiaries, i),
9050                                      AREF (eol_type, i));
9051     }
9052
9053   Fputhash (alias, spec, Vcoding_system_hash_table);
9054   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
9055   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
9056                                 Vcoding_system_alist);
9057
9058   return Qnil;
9059 }
9060
9061 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
9062        1, 1, 0,
9063        doc: /* Return the base of CODING-SYSTEM.
9064 Any alias or subsidiary coding system is not a base coding system.  */)
9065   (coding_system)
9066      Lisp_Object coding_system;
9067 {
9068   Lisp_Object spec, attrs;
9069
9070   if (NILP (coding_system))
9071     return (Qno_conversion);
9072   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9073   attrs = AREF (spec, 0);
9074   return CODING_ATTR_BASE_NAME (attrs);
9075 }
9076
9077 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9078        1, 1, 0,
9079        doc: "Return the property list of CODING-SYSTEM.")
9080      (coding_system)
9081      Lisp_Object coding_system;
9082 {
9083   Lisp_Object spec, attrs;
9084
9085   if (NILP (coding_system))
9086     coding_system = Qno_conversion;
9087   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9088   attrs = AREF (spec, 0);
9089   return CODING_ATTR_PLIST (attrs);
9090 }
9091
9092
9093 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9094        1, 1, 0,
9095        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
9096      (coding_system)
9097      Lisp_Object coding_system;
9098 {
9099   Lisp_Object spec;
9100
9101   if (NILP (coding_system))
9102     coding_system = Qno_conversion;
9103   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9104   return AREF (spec, 1);
9105 }
9106
9107 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9108        Scoding_system_eol_type, 1, 1, 0,
9109        doc: /* Return eol-type of CODING-SYSTEM.
9110 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9111
9112 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9113 and CR respectively.
9114
9115 A vector value indicates that a format of end-of-line should be
9116 detected automatically.  Nth element of the vector is the subsidiary
9117 coding system whose eol-type is N.  */)
9118      (coding_system)
9119      Lisp_Object coding_system;
9120 {
9121   Lisp_Object spec, eol_type;
9122   int n;
9123
9124   if (NILP (coding_system))
9125     coding_system = Qno_conversion;
9126   if (! CODING_SYSTEM_P (coding_system))
9127     return Qnil;
9128   spec = CODING_SYSTEM_SPEC (coding_system);
9129   eol_type = AREF (spec, 2);
9130   if (VECTORP (eol_type))
9131     return Fcopy_sequence (eol_type);
9132   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9133   return make_number (n);
9134 }
9135
9136 #endif /* emacs */
9137
9138 \f
/*** 12. Postamble ***/
9140
9141 void
9142 init_coding_once ()
9143 {
9144   int i;
9145
9146   for (i = 0; i < coding_category_max; i++)
9147     {
9148       coding_categories[i].id = -1;
9149       coding_priorities[i] = i;
9150     }
9151
9152   /* ISO2022 specific initialize routine.  */
9153   for (i = 0; i < 0x20; i++)
9154     iso_code_class[i] = ISO_control_0;
9155   for (i = 0x21; i < 0x7F; i++)
9156     iso_code_class[i] = ISO_graphic_plane_0;
9157   for (i = 0x80; i < 0xA0; i++)
9158     iso_code_class[i] = ISO_control_1;
9159   for (i = 0xA1; i < 0xFF; i++)
9160     iso_code_class[i] = ISO_graphic_plane_1;
9161   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9162   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9163   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9164   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9165   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9166   iso_code_class[ISO_CODE_ESC] = ISO_escape;
9167   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9168   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9169   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9170
9171   for (i = 0; i < 256; i++)
9172     {
9173       emacs_mule_bytes[i] = 1;
9174     }
9175   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9176   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9177   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9178   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9179 }
9180
9181 #ifdef emacs
9182
9183 void
9184 syms_of_coding ()
9185 {
9186   staticpro (&Vcoding_system_hash_table);
9187   {
9188     Lisp_Object args[2];
9189     args[0] = QCtest;
9190     args[1] = Qeq;
9191     Vcoding_system_hash_table = Fmake_hash_table (2, args);
9192   }
9193
9194   staticpro (&Vsjis_coding_system);
9195   Vsjis_coding_system = Qnil;
9196
9197   staticpro (&Vbig5_coding_system);
9198   Vbig5_coding_system = Qnil;
9199
9200   staticpro (&Vcode_conversion_reused_workbuf);
9201   Vcode_conversion_reused_workbuf = Qnil;
9202
9203   staticpro (&Vcode_conversion_workbuf_name);
9204   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9205
9206   reused_workbuf_in_use = 0;
9207
9208   DEFSYM (Qcharset, "charset");
9209   DEFSYM (Qtarget_idx, "target-idx");
9210   DEFSYM (Qcoding_system_history, "coding-system-history");
9211   Fset (Qcoding_system_history, Qnil);
9212
9213   /* Target FILENAME is the first argument.  */
9214   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9215   /* Target FILENAME is the third argument.  */
9216   Fput (Qwrite_region, Qtarget_idx, make_number (2));
9217
9218   DEFSYM (Qcall_process, "call-process");
9219   /* Target PROGRAM is the first argument.  */
9220   Fput (Qcall_process, Qtarget_idx, make_number (0));
9221
9222   DEFSYM (Qcall_process_region, "call-process-region");
9223   /* Target PROGRAM is the third argument.  */
9224   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9225
9226   DEFSYM (Qstart_process, "start-process");
9227   /* Target PROGRAM is the third argument.  */
9228   Fput (Qstart_process, Qtarget_idx, make_number (2));
9229
9230   DEFSYM (Qopen_network_stream, "open-network-stream");
9231   /* Target SERVICE is the fourth argument.  */
9232   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9233
9234   DEFSYM (Qcoding_system, "coding-system");
9235   DEFSYM (Qcoding_aliases, "coding-aliases");
9236
9237   DEFSYM (Qeol_type, "eol-type");
9238   DEFSYM (Qunix, "unix");
9239   DEFSYM (Qdos, "dos");
9240
9241   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9242   DEFSYM (Qpost_read_conversion, "post-read-conversion");
9243   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9244   DEFSYM (Qdefault_char, "default-char");
9245   DEFSYM (Qundecided, "undecided");
9246   DEFSYM (Qno_conversion, "no-conversion");
9247   DEFSYM (Qraw_text, "raw-text");
9248
9249   DEFSYM (Qiso_2022, "iso-2022");
9250
9251   DEFSYM (Qutf_8, "utf-8");
9252   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9253
9254   DEFSYM (Qutf_16, "utf-16");
9255   DEFSYM (Qbig, "big");
9256   DEFSYM (Qlittle, "little");
9257
9258   DEFSYM (Qshift_jis, "shift-jis");
9259   DEFSYM (Qbig5, "big5");
9260
9261   DEFSYM (Qcoding_system_p, "coding-system-p");
9262
9263   DEFSYM (Qcoding_system_error, "coding-system-error");
9264   Fput (Qcoding_system_error, Qerror_conditions,
9265         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9266   Fput (Qcoding_system_error, Qerror_message,
9267         build_string ("Invalid coding system"));
9268
9269   /* Intern this now in case it isn't already done.
9270      Setting this variable twice is harmless.
9271      But don't staticpro it here--that is done in alloc.c.  */
9272   Qchar_table_extra_slots = intern ("char-table-extra-slots");
9273
9274   DEFSYM (Qtranslation_table, "translation-table");
9275   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9276   DEFSYM (Qtranslation_table_id, "translation-table-id");
9277   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9278   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9279
9280   DEFSYM (Qvalid_codes, "valid-codes");
9281
9282   DEFSYM (Qemacs_mule, "emacs-mule");
9283
9284   DEFSYM (QCcategory, ":category");
9285   DEFSYM (QCmnemonic, ":mnemonic");
9286   DEFSYM (QCdefalut_char, ":default-char");
9287   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9288   DEFSYM (QCencode_translation_table, ":encode-translation-table");
9289   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9290   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9291   DEFSYM (QCascii_compatible_p, ":ascii-compatible-p");
9292
9293   Vcoding_category_table
9294     = Fmake_vector (make_number (coding_category_max), Qnil);
9295   staticpro (&Vcoding_category_table);
9296   /* Followings are target of code detection.  */
9297   ASET (Vcoding_category_table, coding_category_iso_7,
9298         intern ("coding-category-iso-7"));
9299   ASET (Vcoding_category_table, coding_category_iso_7_tight,
9300         intern ("coding-category-iso-7-tight"));
9301   ASET (Vcoding_category_table, coding_category_iso_8_1,
9302         intern ("coding-category-iso-8-1"));
9303   ASET (Vcoding_category_table, coding_category_iso_8_2,
9304         intern ("coding-category-iso-8-2"));
9305   ASET (Vcoding_category_table, coding_category_iso_7_else,
9306         intern ("coding-category-iso-7-else"));
9307   ASET (Vcoding_category_table, coding_category_iso_8_else,
9308         intern ("coding-category-iso-8-else"));
9309   ASET (Vcoding_category_table, coding_category_utf_8,
9310         intern ("coding-category-utf-8"));
9311   ASET (Vcoding_category_table, coding_category_utf_16_be,
9312         intern ("coding-category-utf-16-be"));
9313   ASET (Vcoding_category_table, coding_category_utf_16_auto,
9314         intern ("coding-category-utf-16-auto"));
9315   ASET (Vcoding_category_table, coding_category_utf_16_le,
9316         intern ("coding-category-utf-16-le"));
9317   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9318         intern ("coding-category-utf-16-be-nosig"));
9319   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9320         intern ("coding-category-utf-16-le-nosig"));
9321   ASET (Vcoding_category_table, coding_category_charset,
9322         intern ("coding-category-charset"));
9323   ASET (Vcoding_category_table, coding_category_sjis,
9324         intern ("coding-category-sjis"));
9325   ASET (Vcoding_category_table, coding_category_big5,
9326         intern ("coding-category-big5"));
9327   ASET (Vcoding_category_table, coding_category_ccl,
9328         intern ("coding-category-ccl"));
9329   ASET (Vcoding_category_table, coding_category_emacs_mule,
9330         intern ("coding-category-emacs-mule"));
9331   /* Followings are NOT target of code detection.  */
9332   ASET (Vcoding_category_table, coding_category_raw_text,
9333         intern ("coding-category-raw-text"));
9334   ASET (Vcoding_category_table, coding_category_undecided,
9335         intern ("coding-category-undecided"));
9336
9337   DEFSYM (Qinsufficient_source, "insufficient-source");
9338   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9339   DEFSYM (Qinvalid_source, "invalid-source");
9340   DEFSYM (Qinterrupted, "interrupted");
9341   DEFSYM (Qinsufficient_memory, "insufficient-memory");
9342   DEFSYM (Qcoding_system_define_form, "coding-system-define-form");
9343
9344   defsubr (&Scoding_system_p);
9345   defsubr (&Sread_coding_system);
9346   defsubr (&Sread_non_nil_coding_system);
9347   defsubr (&Scheck_coding_system);
9348   defsubr (&Sdetect_coding_region);
9349   defsubr (&Sdetect_coding_string);
9350   defsubr (&Sfind_coding_systems_region_internal);
9351   defsubr (&Sunencodable_char_position);
9352   defsubr (&Scheck_coding_systems_region);
9353   defsubr (&Sdecode_coding_region);
9354   defsubr (&Sencode_coding_region);
9355   defsubr (&Sdecode_coding_string);
9356   defsubr (&Sencode_coding_string);
9357   defsubr (&Sdecode_sjis_char);
9358   defsubr (&Sencode_sjis_char);
9359   defsubr (&Sdecode_big5_char);
9360   defsubr (&Sencode_big5_char);
9361   defsubr (&Sset_terminal_coding_system_internal);
9362   defsubr (&Sset_safe_terminal_coding_system_internal);
9363   defsubr (&Sterminal_coding_system);
9364   defsubr (&Sset_keyboard_coding_system_internal);
9365   defsubr (&Skeyboard_coding_system);
9366   defsubr (&Sfind_operation_coding_system);
9367   defsubr (&Sset_coding_system_priority);
9368   defsubr (&Sdefine_coding_system_internal);
9369   defsubr (&Sdefine_coding_system_alias);
9370   defsubr (&Scoding_system_put);
9371   defsubr (&Scoding_system_base);
9372   defsubr (&Scoding_system_plist);
9373   defsubr (&Scoding_system_aliases);
9374   defsubr (&Scoding_system_eol_type);
9375   defsubr (&Scoding_system_priority_list);
9376
9377   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9378                doc: /* List of coding systems.
9379
9380 Do not alter the value of this variable manually.  This variable should be
9381 updated by the functions `define-coding-system' and
9382 `define-coding-system-alias'.  */);
9383   Vcoding_system_list = Qnil;
9384
9385   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9386                doc: /* Alist of coding system names.
9387 Each element is one element list of coding system name.
9388 This variable is given to `completing-read' as TABLE argument.
9389
9390 Do not alter the value of this variable manually.  This variable should be
9391 updated by the functions `make-coding-system' and
9392 `define-coding-system-alias'.  */);
9393   Vcoding_system_alist = Qnil;
9394
9395   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9396                doc: /* List of coding-categories (symbols) ordered by priority.
9397
9398 On detecting a coding system, Emacs tries code detection algorithms
9399 associated with each coding-category one by one in this order.  When
9400 one algorithm agrees with a byte sequence of source text, the coding
9401 system bound to the corresponding coding-category is selected.
9402
9403 Don't modify this variable directly, but use `set-coding-priority'.  */);
9404   {
9405     int i;
9406
9407     Vcoding_category_list = Qnil;
9408     for (i = coding_category_max - 1; i >= 0; i--)
9409       Vcoding_category_list
9410         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9411                  Vcoding_category_list);
9412   }
9413
9414   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9415                doc: /* Specify the coding system for read operations.
9416 It is useful to bind this variable with `let', but do not set it globally.
9417 If the value is a coding system, it is used for decoding on read operation.
9418 If not, an appropriate element is used from one of the coding system alists:
9419 There are three such tables, `file-coding-system-alist',
9420 `process-coding-system-alist', and `network-coding-system-alist'.  */);
9421   Vcoding_system_for_read = Qnil;
9422
9423   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9424                doc: /* Specify the coding system for write operations.
9425 Programs bind this variable with `let', but you should not set it globally.
9426 If the value is a coding system, it is used for encoding of output,
9427 when writing it to a file and when sending it to a file or subprocess.
9428
9429 If this does not specify a coding system, an appropriate element
9430 is used from one of the coding system alists:
9431 There are three such tables, `file-coding-system-alist',
9432 `process-coding-system-alist', and `network-coding-system-alist'.
9433 For output to files, if the above procedure does not specify a coding system,
9434 the value of `buffer-file-coding-system' is used.  */);
9435   Vcoding_system_for_write = Qnil;
9436
9437   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9438                doc: /*
9439 Coding system used in the latest file or process I/O.  */);
9440   Vlast_coding_system_used = Qnil;
9441
9442   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9443                doc: /*
9444 Error status of the last code conversion.
9445
9446 When an error was detected in the last code conversion, this variable
9447 is set to one of the following symbols.
9448   `insufficient-source'
9449   `inconsistent-eol'
9450   `invalid-source'
9451   `interrupted'
9452   `insufficient-memory'
9453 When no error was detected, the value doesn't change.  So, to check
9454 the error status of a code conversion by this variable, you must
9455 explicitly set this variable to nil before performing code
9456 conversion.  */);
9457   Vlast_code_conversion_error = Qnil;
9458
9459   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9460                doc: /*
9461 *Non-nil means always inhibit code conversion of end-of-line format.
9462 See info node `Coding Systems' and info node `Text and Binary' concerning
9463 such conversion.  */);
9464   inhibit_eol_conversion = 0;
9465
9466   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9467                doc: /*
9468 Non-nil means process buffer inherits coding system of process output.
9469 Bind it to t if the process output is to be treated as if it were a file
9470 read from some filesystem.  */);
9471   inherit_process_coding_system = 0;
9472
9473   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9474                doc: /*
9475 Alist to decide a coding system to use for a file I/O operation.
9476 The format is ((PATTERN . VAL) ...),
9477 where PATTERN is a regular expression matching a file name,
9478 VAL is a coding system, a cons of coding systems, or a function symbol.
9479 If VAL is a coding system, it is used for both decoding and encoding
9480 the file contents.
9481 If VAL is a cons of coding systems, the car part is used for decoding,
9482 and the cdr part is used for encoding.
9483 If VAL is a function symbol, the function must return a coding system
9484 or a cons of coding systems which are used as above.  The function gets
9485 the arguments with which `find-operation-coding-systems' was called.
9486
9487 See also the function `find-operation-coding-system'
9488 and the variable `auto-coding-alist'.  */);
9489   Vfile_coding_system_alist = Qnil;
9490
9491   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9492                doc: /*
9493 Alist to decide a coding system to use for a process I/O operation.
9494 The format is ((PATTERN . VAL) ...),
9495 where PATTERN is a regular expression matching a program name,
9496 VAL is a coding system, a cons of coding systems, or a function symbol.
9497 If VAL is a coding system, it is used for both decoding what received
9498 from the program and encoding what sent to the program.
9499 If VAL is a cons of coding systems, the car part is used for decoding,
9500 and the cdr part is used for encoding.
9501 If VAL is a function symbol, the function must return a coding system
9502 or a cons of coding systems which are used as above.
9503
9504 See also the function `find-operation-coding-system'.  */);
9505   Vprocess_coding_system_alist = Qnil;
9506
9507   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9508                doc: /*
9509 Alist to decide a coding system to use for a network I/O operation.
9510 The format is ((PATTERN . VAL) ...),
9511 where PATTERN is a regular expression matching a network service name
9512 or is a port number to connect to,
9513 VAL is a coding system, a cons of coding systems, or a function symbol.
9514 If VAL is a coding system, it is used for both decoding what received
9515 from the network stream and encoding what sent to the network stream.
9516 If VAL is a cons of coding systems, the car part is used for decoding,
9517 and the cdr part is used for encoding.
9518 If VAL is a function symbol, the function must return a coding system
9519 or a cons of coding systems which are used as above.
9520
9521 See also the function `find-operation-coding-system'.  */);
9522   Vnetwork_coding_system_alist = Qnil;
9523
9524   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9525                doc: /* Coding system to use with system messages.
9526 Also used for decoding keyboard input on X Window system.  */);
9527   Vlocale_coding_system = Qnil;
9528
9529   /* The eol mnemonics are reset in startup.el system-dependently.  */
9530   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9531                doc: /*
9532 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
9533   eol_mnemonic_unix = build_string (":");
9534
9535   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9536                doc: /*
9537 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
9538   eol_mnemonic_dos = build_string ("\\");
9539
9540   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9541                doc: /*
9542 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
9543   eol_mnemonic_mac = build_string ("/");
9544
9545   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9546                doc: /*
9547 *String displayed in mode line when end-of-line format is not yet determined.  */);
9548   eol_mnemonic_undecided = build_string (":");
9549
9550   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9551                doc: /*
9552 *Non-nil enables character translation while encoding and decoding.  */);
9553   Venable_character_translation = Qt;
9554
9555   DEFVAR_LISP ("standard-translation-table-for-decode",
9556                &Vstandard_translation_table_for_decode,
9557                doc: /* Table for translating characters while decoding.  */);
9558   Vstandard_translation_table_for_decode = Qnil;
9559
9560   DEFVAR_LISP ("standard-translation-table-for-encode",
9561                &Vstandard_translation_table_for_encode,
9562                doc: /* Table for translating characters while encoding.  */);
9563   Vstandard_translation_table_for_encode = Qnil;
9564
9565   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9566                doc: /* Alist of charsets vs revision numbers.
9567 While encoding, if a charset (car part of an element) is found,
9568 designate it with the escape sequence identifying revision (cdr part
9569 of the element).  */);
9570   Vcharset_revision_table = Qnil;
9571
9572   DEFVAR_LISP ("default-process-coding-system",
9573                &Vdefault_process_coding_system,
9574                doc: /* Cons of coding systems used for process I/O by default.
9575 The car part is used for decoding a process output,
9576 the cdr part is used for encoding a text to be sent to a process.  */);
9577   Vdefault_process_coding_system = Qnil;
9578
9579   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9580                doc: /*
9581 Table of extra Latin codes in the range 128..159 (inclusive).
9582 This is a vector of length 256.
9583 If Nth element is non-nil, the existence of code N in a file
9584 \(or output of subprocess) doesn't prevent it to be detected as
9585 a coding system of ISO 2022 variant which has a flag
9586 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9587 or reading output of a subprocess.
9588 Only 128th through 159th elements has a meaning.  */);
9589   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9590
9591   DEFVAR_LISP ("select-safe-coding-system-function",
9592                &Vselect_safe_coding_system_function,
9593                doc: /*
9594 Function to call to select safe coding system for encoding a text.
9595
9596 If set, this function is called to force a user to select a proper
9597 coding system which can encode the text in the case that a default
9598 coding system used in each operation can't encode the text.
9599
9600 The default value is `select-safe-coding-system' (which see).  */);
9601   Vselect_safe_coding_system_function = Qnil;
9602
9603   DEFVAR_BOOL ("coding-system-require-warning",
9604                &coding_system_require_warning,
9605                doc: /* Internal use only.
9606 If non-nil, on writing a file, `select-safe-coding-system-function' is
9607 called even if `coding-system-for-write' is non-nil.  The command
9608 `universal-coding-system-argument' binds this variable to t temporarily.  */);
9609   coding_system_require_warning = 0;
9610
9611
9612   DEFVAR_BOOL ("inhibit-iso-escape-detection",
9613                &inhibit_iso_escape_detection,
9614                doc: /*
9615 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9616
9617 By default, on reading a file, Emacs tries to detect how the text is
9618 encoded.  This code detection is sensitive to escape sequences.  If
9619 the sequence is valid as ISO2022, the code is determined as one of
9620 the ISO2022 encodings, and the file is decoded by the corresponding
9621 coding system (e.g. `iso-2022-7bit').
9622
9623 However, there may be a case that you want to read escape sequences in
9624 a file as is.  In such a case, you can set this variable to non-nil.
9625 Then, as the code detection ignores any escape sequences, no file is
9626 detected as encoded in some ISO2022 encoding.  The result is that all
9627 escape sequences become visible in a buffer.
9628
9629 The default value is nil, and it is strongly recommended not to change
9630 it.  That is because many Emacs Lisp source files that contain
9631 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9632 in Emacs's distribution, and they won't be decoded correctly on
9633 reading if you suppress escape sequence detection.
9634
9635 The other way to read escape sequences in a file without decoding is
9636 to explicitly specify some coding system that doesn't use ISO2022's
9637 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
9638   inhibit_iso_escape_detection = 0;
9639
9640   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9641                doc: /* Char table for translating self-inserting characters.
9642 This is applied to the result of input methods, not their input.  See also
9643 `keyboard-translate-table'.  */);
9644     Vtranslation_table_for_input = Qnil;
9645
9646   {
9647     Lisp_Object args[coding_arg_max];
9648     Lisp_Object plist[16];
9649     int i;
9650
9651     for (i = 0; i < coding_arg_max; i++)
9652       args[i] = Qnil;
9653
9654     plist[0] = intern (":name");
9655     plist[1] = args[coding_arg_name] = Qno_conversion;
9656     plist[2] = intern (":mnemonic");
9657     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9658     plist[4] = intern (":coding-type");
9659     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9660     plist[6] = intern (":ascii-compatible-p");
9661     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9662     plist[8] = intern (":default-char");
9663     plist[9] = args[coding_arg_default_char] = make_number (0);
9664     plist[10] = intern (":for-unibyte");
9665     plist[11] = args[coding_arg_for_unibyte] = Qt;
9666     plist[12] = intern (":docstring");
9667     plist[13] = build_string ("Do no conversion.\n\
9668 \n\
9669 When you visit a file with this coding, the file is read into a\n\
9670 unibyte buffer as is, thus each byte of a file is treated as a\n\
9671 character.");
9672     plist[14] = intern (":eol-type");
9673     plist[15] = args[coding_arg_eol_type] = Qunix;
9674     args[coding_arg_plist] = Flist (16, plist);
9675     Fdefine_coding_system_internal (coding_arg_max, args);
9676
9677     plist[1] = args[coding_arg_name] = Qundecided;
9678     plist[3] = args[coding_arg_mnemonic] = make_number ('-');
9679     plist[5] = args[coding_arg_coding_type] = Qundecided;
9680     /* This is already set.
9681        plist[7] = args[coding_arg_ascii_compatible_p] = Qt; */
9682     plist[8] = intern (":charset-list");
9683     plist[9] = args[coding_arg_charset_list] = Fcons (Qascii, Qnil);
9684     plist[11] = args[coding_arg_for_unibyte] = Qnil;
9685     plist[13] = build_string ("No conversion on encoding, automatic conversion on decoding.");
9686     plist[15] = args[coding_arg_eol_type] = Qnil;
9687     args[coding_arg_plist] = Flist (16, plist);
9688     Fdefine_coding_system_internal (coding_arg_max, args);
9689   }
9690
9691   setup_coding_system (Qno_conversion, &keyboard_coding);
9692   setup_coding_system (Qundecided, &terminal_coding);
9693   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9694
9695   {
9696     int i;
9697
9698     for (i = 0; i < coding_category_max; i++)
9699       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9700   }
9701 }
9702
9703 char *
9704 emacs_strerror (error_number)
9705      int error_number;
9706 {
9707   char *str;
9708
9709   synchronize_system_messages_locale ();
9710   str = strerror (error_number);
9711
9712   if (! NILP (Vlocale_coding_system))
9713     {
9714       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9715                                                       Vlocale_coding_system,
9716                                                       0);
9717       str = (char *) SDATA (dec);
9718     }
9719
9720   return str;
9721 }
9722
9723 #endif /* emacs */
9724
9725 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9726    (do not change this comment) */