src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002 Free Software Foundation, Inc.
   5    Copyright (C) 2003
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcoding_system_hash_table.  The conversion
  61   from coding system symbol to attributes vector is done by looking
  62   up Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on the operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (coding, detect_info)
 157      struct coding_system *coding;
 158      struct coding_detection_info *detect_info;
 159 {
 160   unsigned char *src = coding->source;
 161   unsigned char *src_end = coding->source + coding->src_bytes;
 162   int multibytep = coding->src_multibyte;
 163   int consumed_chars = 0;
 164   int found = 0;
       /* Placeholder for further declarations.  ONE_MORE_BYTE below
          also refers to `c' and `src_base', which a real detector has
          to declare.  */
 165   ...;
 166
 167   while (1)
 168     {
 169       /* Get one byte from the source.  If the source is exhausted,
 170          jump to no_more_source:.  */
 171       ONE_MORE_BYTE (c);
 172
 173       if (! __C_conforms_to_XXX___ (c))
 174         break;
           /* NOTE(review): the negation below looks inverted relative to
              the placeholder's name -- confirm before copying this
              template.  */
 175       if (! __C_strongly_suggests_XXX__ (c))
 176         found = CATEGORY_MASK_XXX;
 177     }
 178   /* The byte sequence is invalid for XXX.  */
 179   detect_info->rejected |= CATEGORY_MASK_XXX;
 180   return 0;
 181
 182  no_more_source:
 183   /* The source was exhausted successfully.  */
 184   detect_info->found |= found;
 185   return 1;
 186 }
 187 #endif
 188
 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 190
 191   These functions decode a byte sequence specified as a source by
 192   CODING.  The resulting multibyte text goes to a place pointed to by
 193   CODING->charbuf, the length of which should not exceed
 194   CODING->charbuf_size.
 195
 196   These functions set the information of original and decoded texts in
 197   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 198   They also set CODING->result to one of CODING_RESULT_XXX indicating
 199   how the decoding is finished.
 200
 201   Below is the template of these functions.  */
 202
 203 #if 0
 204 static void
 205 decode_coding_XXXX (coding)
 206      struct coding_system *coding;
 207 {
 208   unsigned char *src = coding->source + coding->consumed;
 209   unsigned char *src_end = coding->source + coding->src_bytes;
 210   /* SRC_BASE remembers the start position in source in each loop.
 211      The loop will be exited when there's not enough source code, or
 212      when there's no room in CHARBUF for a decoded character.  */
 213   unsigned char *src_base;
 214   /* A buffer to produce decoded characters.  */
 215   int *charbuf = coding->charbuf + coding->charbuf_used;
 216   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 217   int multibytep = coding->src_multibyte;
       /* ONE_MORE_BYTE below also refers to `c' and `consumed_chars';
          a real decoder has to declare them.  */
 218
 219   while (1)
 220     {
 221       src_base = src;
           /* Stop once CHARBUF is full; the test must be `>=', because
              `<' would leave the loop while room still remains.  */
 222       if (charbuf >= charbuf_end)
 223         /* No more room to produce a decoded character.  */
 224         break;
 225       ONE_MORE_BYTE (c);
 226       /* Decode it. */
 227     }
 228
 229  no_more_source:
 230   if (src_base < src_end
 231       && coding->mode & CODING_MODE_LAST_BLOCK)
 232     /* If the source ends by partial bytes to construct a character,
 233        treat them as eight-bit raw data.  */
 234     while (src_base < src_end && charbuf < charbuf_end)
 235       *charbuf++ = *src_base++;
 236   /* Remember how many bytes and characters we consumed.  If the
 237      source is multibyte, the bytes and chars are not identical.  */
       /* NOTE(review): the assignment below stores the same value in
          both members, which does not match the remark above for
          multibyte sources -- confirm against a real decoder.  */
 238   coding->consumed = coding->consumed_char = src_base - coding->source;
 239   /* Remember how many characters we produced.  */
 240   coding->charbuf_used = charbuf - coding->charbuf;
 241 }
 242 #endif
 243
 244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 245
 246   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 247   internal multibyte format by CODING.  The resulting byte sequence
 248   goes to a place pointed to by DESTINATION, the length of which
 249   should not exceed DST_BYTES.
 250
 251   These functions set the information of original and encoded texts in
 252   the members produced, produced_char, consumed, and consumed_char of
 253   the structure *CODING.  They also set the member result to one of
 254   CODING_RESULT_XXX indicating how the encoding finished.
 255
 256   DST_BYTES zero means that the source area and the destination area
 257   overlap, which means that we can produce encoded text only until it
 258   reaches the head of the not-yet-encoded source text.
 259
 260   Below is a template of these functions.  */
 261 #if 0
 262 static void
 263 encode_coding_XXX (coding)
 264      struct coding_system *coding;
 265 {
       /* MULTIBYTEP is read by the EMIT_XXX macros that a real encoder
          uses to store bytes at DST.  */
 266   int multibytep = coding->dst_multibyte;
       /* Characters to encode: CHARBUF up to CHARBUF_END.  Both bounds
          come from CODING; CHARBUF itself is a plain `int *'.  */
 267   int *charbuf = coding->charbuf;
 268   int *charbuf_end = coding->charbuf + coding->charbuf_used;
 269   unsigned char *dst = coding->destination + coding->produced;
 270   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* Leave head room for the largest output of one iteration so that
          the loop body needs no further bounds checks.  */
 271   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 272   int produced_chars = 0;
 273
 274   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 275     {
 276       int c = *charbuf;
 277       /* Encode C into DST, and increment DST.  */
 278     }
 279  label_no_more_destination:
 280   /* How many chars and bytes we produced.  */
 281   coding->produced_char += produced_chars;
 282   coding->produced = dst - coding->destination;
 283 }
 284 #endif
 285
 286 \f
 287 /*** 1. Preamble ***/
 288
 289 #include <config.h>
 290 #include <stdio.h>
 291
 292 #include "lisp.h"
 293 #include "buffer.h"
 294 #include "character.h"
 295 #include "charset.h"
 296 #include "ccl.h"
 297 #include "composite.h"
 298 #include "coding.h"
 299 #include "window.h"
 300
 301 Lisp_Object Vcoding_system_hash_table;
 302
     /* Lisp objects with Q and QC prefixes; presumably symbols and
        keywords that are set up during initialization, which is not
        visible in this part of the file -- TODO confirm.  */
 303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 304 Lisp_Object Qunix, Qdos;
 305 extern Lisp_Object Qmac;        /* frame.c */
 306 Lisp_Object Qbuffer_file_coding_system;
 307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 308 Lisp_Object Qdefault_char;
 309 Lisp_Object Qno_conversion, Qundecided;
 310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 311 Lisp_Object Qbig, Qlittle;
 312 Lisp_Object Qcoding_system_history;
 313 Lisp_Object Qvalid_codes;
     /* NOTE(review): `QCdefalut_char' looks like a misspelling of
        "default".  Renaming it means updating every user of the name,
        which cannot be checked from this part of the file.  */
 314 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
 315 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
 316 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 317
     /* The first two are defined in another file (hence `extern').  */
 318 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 319 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 320 Lisp_Object Qstart_process, Qopen_network_stream;
 321 Lisp_Object Qtarget_idx;
 322
     /* Presumably the Lisp-level names of the CODING_RESULT_XXX
        conversion results -- TODO confirm against their users.  */
 323 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
 324 Lisp_Object Qinterrupted, Qinsufficient_memory;
 325
 326 int coding_system_require_warning;
 327
 328 Lisp_Object Vselect_safe_coding_system_function;
 329
 330 /* Mnemonic string for each format of end-of-line.  */
 331 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 332 /* Mnemonic string to indicate format of end-of-line is not yet
 333    decided.  */
 334 Lisp_Object eol_mnemonic_undecided;
 335
 336 #ifdef emacs
 337
 338 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 339
 340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 341
 342 /* Coding system emacs-mule and raw-text are for converting only
 343    end-of-line format.  */
 344 Lisp_Object Qemacs_mule, Qraw_text;
 345 Lisp_Object Qutf_8_emacs;
 346
 347 /* Coding-systems are handed between Emacs Lisp programs and C internal
 348    routines by the following three variables.  */
 349 /* Coding-system for reading files and receiving data from process.  */
 350 Lisp_Object Vcoding_system_for_read;
 351 /* Coding-system for writing files and sending data to process.  */
 352 Lisp_Object Vcoding_system_for_write;
 353 /* Coding-system actually used in the latest I/O.  */
 354 Lisp_Object Vlast_coding_system_used;
 355 /* Set to non-nil when an error is detected during code conversion.  */
 356 Lisp_Object Vlast_code_conversion_error;
 357 /* A vector of length 256 which contains information about special
 358    Latin codes (especially for dealing with Microsoft codes).  */
 359 Lisp_Object Vlatin_extra_code_table;
 360
 361 /* Flag to inhibit code conversion of end-of-line format.  */
 362 int inhibit_eol_conversion;
 363
 364 /* Flag to inhibit ISO2022 escape sequence detection.  */
 365 int inhibit_iso_escape_detection;
 366
 367 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 368 int inherit_process_coding_system;
 369
 370 /* Coding system to be used to encode text for terminal display.  */
 371 struct coding_system terminal_coding;
 372
 373 /* Coding system to be used to encode text for terminal display when
 374    terminal coding system is nil.  */
 375 struct coding_system safe_terminal_coding;
 376
 377 /* Coding system of what is sent from terminal keyboard.  */
 378 struct coding_system keyboard_coding;
 379
 380 Lisp_Object Vfile_coding_system_alist;
 381 Lisp_Object Vprocess_coding_system_alist;
 382 Lisp_Object Vnetwork_coding_system_alist;
 383
 384 Lisp_Object Vlocale_coding_system;
 385
 386 #endif /* emacs */
 387
 388 /* Flag to tell if we look up translation table on character code
 389    conversion.  */
 390 Lisp_Object Venable_character_translation;
 391 /* Standard translation table to look up on decoding (reading).  */
 392 Lisp_Object Vstandard_translation_table_for_decode;
 393 /* Standard translation table to look up on encoding (writing).  */
 394 Lisp_Object Vstandard_translation_table_for_encode;
 395
 396 Lisp_Object Qtranslation_table;
 397 Lisp_Object Qtranslation_table_id;
 398 Lisp_Object Qtranslation_table_for_decode;
 399 Lisp_Object Qtranslation_table_for_encode;
 400
 401 /* Alist of charsets vs revision number.  */
 402 static Lisp_Object Vcharset_revision_table;
 403
 404 /* Default coding systems used for process I/O.  */
 405 Lisp_Object Vdefault_process_coding_system;
 406
 407 /* Char table for translating Quail and self-inserting input.  */
 408 Lisp_Object Vtranslation_table_for_input;
 409
 410 /* Two special coding systems.  */
 411 Lisp_Object Vsjis_coding_system;
 412 Lisp_Object Vbig5_coding_system;
 413
 414 /* ISO2022 section */
 415
     /* Fetch element REG of the object stored at index
        `coding_attr_iso_initial' of CODING's attribute vector (as given
        by CODING_ID_ATTRS), and pass it through XINT.  */
 416 #define CODING_ISO_INITIAL(coding, reg)                 \
 417   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 418                      coding_attr_iso_initial),          \
 419                reg)))
 420
 421
     /* Return CODING->safe_charsets[CHARSET_ID] if CHARSET_ID does not
        exceed CODING->max_charset_id, otherwise -1.  CHARSET_ID is
        parenthesized so that an expression argument keeps its meaning,
        but it is evaluated twice and so must not have side effects.  */
 422 #define CODING_ISO_REQUEST(coding, charset_id)  \
 423   (((charset_id) <= (coding)->max_charset_id    \
 424     ? (coding)->safe_charsets[(charset_id)]     \
 425     : -1))
 426
 427
     /* Accessors for the members of CODING->spec.iso_2022.  Each one
        expands to an lvalue expression on the named member.  */
 428 #define CODING_ISO_FLAGS(coding)        \
 429   ((coding)->spec.iso_2022.flags)
 430 #define CODING_ISO_DESIGNATION(coding, reg)     \
 431   ((coding)->spec.iso_2022.current_designation[reg])
 432 #define CODING_ISO_INVOCATION(coding, plane)    \
 433   ((coding)->spec.iso_2022.current_invocation[plane])
 434 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 435   ((coding)->spec.iso_2022.single_shifting)
 436 #define CODING_ISO_BOL(coding)  \
 437   ((coding)->spec.iso_2022.bol)
     /* Two-step lookup: take the value of current_invocation[PLANE] and
        use it as the index into current_designation.  */
 438 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 439   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 440
 441 /* Control characters of ISO2022.  */
 442                         /* code */      /* function */
 443 #define ISO_CODE_LF     0x0A            /* line-feed */
 444 #define ISO_CODE_CR     0x0D            /* carriage-return */
 445 #define ISO_CODE_SO     0x0E            /* shift-out */
 446 #define ISO_CODE_SI     0x0F            /* shift-in */
 447 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 448 #define ISO_CODE_ESC    0x1B            /* escape */
 449 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 450 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 451 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 452
 453 /* Every 1-byte code of ISO2022 is classified into one of the
 454    following classes.  */
 455 enum iso_code_class_type
 456   {
 457     ISO_control_0,              /* Control codes in the range
 458                                    0x00..0x1F and 0x7F, except for the
 459                                    following 4 codes.  */
         /* NOTE(review): 0x7F is also named by ISO_0x20_or_0x7F below;
            confirm which class the classification table assigns it.  */
 460     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 461     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 462     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 463     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
 464     ISO_control_1,              /* Control codes in the range
 465                                    0x80..0x9F, except for the
 466                                    following 3 codes.  */
 467     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 468     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 469     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 470     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 471     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 472     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 473     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 474   };
 475
 476 /** Each macro CODING_ISO_FLAG_XXX defines one flag bit of the
 477     `iso-flags' attribute of an iso2022 coding system.  */
 478
 479 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 480    instead of the correct short-form sequence (e.g. ESC $ A).  */
 481 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 482
 483 /* If set, reset graphic planes and registers at end-of-line to the
 484    initial state.  */
 485 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 486
 487 /* If set, reset graphic planes and registers before any control
 488    characters to the initial state.  */
 489 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 490
 491 /* If set, encode by 7-bit environment.  */
 492 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 493
 494 /* If set, use locking-shift function.  */
 495 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 496
 497 /* If set, use single-shift function.  Overwrite
 498    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 499 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 500
 501 /* If set, use designation escape sequence.  */
 502 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 503
 504 /* If set, produce revision number sequence.  */
 505 #define CODING_ISO_FLAG_REVISION        0x0080
 506
 507 /* If set, produce ISO6429's direction specifying sequence.  */
 508 #define CODING_ISO_FLAG_DIRECTION       0x0100
 509
 510 /* If set, assume designation states are reset at beginning of line on
 511    output.  */
 512 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 513
 514 /* If set, designation sequence should be placed at beginning of line
 515    on output.  */
 516 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 517
 518 /* If set, do not encode unsafe characters on output.  */
 519 #define CODING_ISO_FLAG_SAFE            0x0800
 520
 521 /* If set, extra latin codes (128..159) are accepted as a valid code
 522    on input.  */
 523 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 524
     /* The five flags below have no description in this part of the
        file; take their meaning from the code that tests them.  Note
        that the values jump from 0x10000 to 0x100000, leaving the bits
        0x20000..0x80000 unassigned here.  */
 525 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 526
 527 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 528
 529 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 530
 531 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 532
 533 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 534
 535 /* A character to be produced on output if encoding of the original
 536    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 537 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 538
 539
 540 /* UTF-16 section */
     /* Accessors for the members `bom', `endian' and `surrogate' of
        CODING->spec.utf_16.  */
 541 #define CODING_UTF_16_BOM(coding)       \
 542   ((coding)->spec.utf_16.bom)
 543
 544 #define CODING_UTF_16_ENDIAN(coding)    \
 545   ((coding)->spec.utf_16.endian)
 546
 547 #define CODING_UTF_16_SURROGATE(coding) \
 548   ((coding)->spec.utf_16.surrogate)
 549
 550
 551 /* CCL section */
     /* Each macro reads one slot (coding_attr_ccl_decoder,
        coding_attr_ccl_encoder, coding_attr_ccl_valids) of CODING's
        attribute vector.  CODING_CCL_VALIDS additionally applies SDATA
        to the slot, so that slot presumably holds a Lisp string --
        TODO confirm.  */
 552 #define CODING_CCL_DECODER(coding)      \
 553   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 554 #define CODING_CCL_ENCODER(coding)      \
 555   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 556 #define CODING_CCL_VALIDS(coding)                                          \
 557   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 558
 559 /* Index for each coding category in `coding_categories'.  */
     /* The enumerator values are also the bit positions used by the
        CATEGORY_MASK_XXX macros (1 << value), so reordering this list
        changes those masks.  `coding_category_max' is not a category;
        it gives the number of enumerators and sizes the arrays
        `coding_priorities' and `coding_categories'.  */
 560
 561 enum coding_category
 562   {
 563     coding_category_iso_7,
 564     coding_category_iso_7_tight,
 565     coding_category_iso_8_1,
 566     coding_category_iso_8_2,
 567     coding_category_iso_7_else,
 568     coding_category_iso_8_else,
 569     coding_category_utf_8,
 570     coding_category_utf_16_auto,
 571     coding_category_utf_16_be,
 572     coding_category_utf_16_le,
 573     coding_category_utf_16_be_nosig,
 574     coding_category_utf_16_le_nosig,
 575     coding_category_charset,
 576     coding_category_sjis,
 577     coding_category_big5,
 578     coding_category_ccl,
 579     coding_category_emacs_mule,
 580     /* All above are targets of code detection.  */
 581     coding_category_raw_text,
 582     coding_category_undecided,
 583     coding_category_max
 584   };
 585
 586 /* Definitions of flag bits used in detect_coding_XXXX.  Each mask is
        the single bit whose position is the matching `enum
        coding_category' value.  No mask is defined here for
        coding_category_undecided.  */
 587 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 588 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 589 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 590 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 591 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 592 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 593 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 594 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
 595 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 596 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 597 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 598 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 599 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 600 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 601 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 602 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 603 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 604 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
 605
 606 /* This value is returned if detect_coding_mask () finds nothing other
 607    than ASCII characters.  */
     /* NOTE(review): CATEGORY_MASK_UTF_16_AUTO and CATEGORY_MASK_RAW_TEXT
        are not part of this union -- confirm that leaving out
        UTF_16_AUTO is intended.  */
 608 #define CATEGORY_MASK_ANY               \
 609   (CATEGORY_MASK_ISO_7                  \
 610    | CATEGORY_MASK_ISO_7_TIGHT          \
 611    | CATEGORY_MASK_ISO_8_1              \
 612    | CATEGORY_MASK_ISO_8_2              \
 613    | CATEGORY_MASK_ISO_7_ELSE           \
 614    | CATEGORY_MASK_ISO_8_ELSE           \
 615    | CATEGORY_MASK_UTF_8                \
 616    | CATEGORY_MASK_UTF_16_BE            \
 617    | CATEGORY_MASK_UTF_16_LE            \
 618    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 619    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 620    | CATEGORY_MASK_CHARSET              \
 621    | CATEGORY_MASK_SJIS                 \
 622    | CATEGORY_MASK_BIG5                 \
 623    | CATEGORY_MASK_CCL                  \
 624    | CATEGORY_MASK_EMACS_MULE)
 625
 626
     /* Unions of the ISO2022 category bits, grouped by the suffix of the
        category names they combine.  */
 627 #define CATEGORY_MASK_ISO_7BIT \
 628   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 629
 630 #define CATEGORY_MASK_ISO_8BIT \
 631   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 632
 633 #define CATEGORY_MASK_ISO_ELSE \
 634   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 635
     /* Every ISO category except ISO_8_1 and ISO_8_2.  */
 636 #define CATEGORY_MASK_ISO_ESCAPE        \
 637   (CATEGORY_MASK_ISO_7                  \
 638    | CATEGORY_MASK_ISO_7_TIGHT          \
 639    | CATEGORY_MASK_ISO_7_ELSE           \
 640    | CATEGORY_MASK_ISO_8_ELSE)
 641
     /* All six ISO categories.  */
 642 #define CATEGORY_MASK_ISO       \
 643   (  CATEGORY_MASK_ISO_7BIT     \
 644      | CATEGORY_MASK_ISO_8BIT   \
 645      | CATEGORY_MASK_ISO_ELSE)
 646
     /* The four UTF-16 categories with a fixed byte order.
        NOTE(review): CATEGORY_MASK_UTF_16_AUTO is not included --
        confirm that this is intended.  */
 647 #define CATEGORY_MASK_UTF_16            \
 648   (CATEGORY_MASK_UTF_16_BE              \
 649    | CATEGORY_MASK_UTF_16_LE            \
 650    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 651    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 652
 653
 654 /* List of symbols `coding-category-xxx' ordered by priority.  This
 655    variable is exposed to Emacs Lisp.  */
 656 static Lisp_Object Vcoding_category_list;
 657
 658 /* Table of coding categories (Lisp symbols).  This variable is for
 659    internal use only.  */
 660 static Lisp_Object Vcoding_category_table;
 661
 662 /* Table of coding-categories ordered by priority.  It has one slot
        per `enum coding_category' value below coding_category_max.  */
 663 static enum coding_category coding_priorities[coding_category_max];
 664
 665 /* Nth element is a coding context for the coding system bound to the
 666    Nth coding category.  */
 667 static struct coding_system coding_categories[coding_category_max];
 668
 669 /*** Commonly used macros and functions ***/
 670
     /* Fallback definitions, used only when `min' / `max' are not
        already defined.  Each argument may be evaluated twice, so do
        not pass expressions with side effects.  */
 671 #ifndef min
 672 #define min(a, b) ((a) < (b) ? (a) : (b))
 673 #endif
 674 #ifndef max
 675 #define max(a, b) ((a) > (b) ? (a) : (b))
 676 #endif
 677
     /* Set ATTRS to the attribute vector of CODING (CODING_ID_ATTRS of
        CODING->id), then set CHARSET_LIST to CODING_ATTR_CHARSET_LIST
        of ATTRS.  ATTRS and CHARSET_LIST must be assignable lvalues.  */
 678 #define CODING_GET_INFO(coding, attrs, charset_list)    \
 679   do {                                                  \
 680     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
 681     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
 682   } while (0)
 683
 684
 685 /* Safely get one byte from the source text pointed to by SRC, which
 686    ends at SRC_END, set C to that byte and increment CONSUMED_CHARS.
 687    If SRC is already at SRC_END, jump to `no_more_source', first
 688    recording CODING_RESULT_INSUFFICIENT_SRC when SRC_BASE < SRC.  If
 689    multibytep is nonzero and the byte has its high bit set, see the
 690    notes below.  The caller should declare and set these in advance:
 691         src, src_end, src_base, multibytep, consumed_chars, coding */
     /* Multibyte handling: a lead byte of 0xC0 or 0xC1 is combined with
        the next byte into ((lead & 1) << 6) | next.  For any other byte
        with the high bit set, C becomes the negated result of
        string_char () called at that byte, and
        CODING_RESULT_INVALID_SRC is recorded.
        NOTE(review): the second byte of the 0xC0/0xC1 case is read
        without comparing SRC against SRC_END -- confirm that callers
        guarantee a well-formed multibyte source.  */
 692
 693 #define ONE_MORE_BYTE(c)                                \
 694   do {                                                  \
 695     if (src == src_end)                                 \
 696       {                                                 \
 697         if (src_base < src)                             \
 698           record_conversion_result                      \
 699             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
 700         goto no_more_source;                            \
 701       }                                                 \
 702     c = *src++;                                         \
 703     if (multibytep && (c & 0x80))                       \
 704       {                                                 \
 705         if ((c & 0xFE) == 0xC0)                         \
 706           c = ((c & 1) << 6) | *src++;                  \
 707         else                                            \
 708           {                                             \
 709             c = - string_char (--src, &src, NULL);      \
 710             record_conversion_result                    \
 711               (coding, CODING_RESULT_INVALID_SRC);      \
 712           }                                             \
 713       }                                                 \
 714     consumed_chars++;                                   \
 715   } while (0)
 716
 717
     /* Like ONE_MORE_BYTE, but without the test of SRC against SRC_END
        and therefore without the jump to `no_more_source'.  The caller
        must already know that enough source bytes remain.  Uses the
        variables src, multibytep, consumed_chars and coding.  */
 718 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
 719   do {                                                  \
 720     c = *src++;                                         \
 721     if (multibytep && (c & 0x80))                       \
 722       {                                                 \
 723         if ((c & 0xFE) == 0xC0)                         \
 724           c = ((c & 1) << 6) | *src++;                  \
 725         else                                            \
 726           {                                             \
 727             c = - string_char (--src, &src, NULL);      \
 728             record_conversion_result                    \
 729               (coding, CODING_RESULT_INVALID_SRC);      \
 730           }                                             \
 731       }                                                 \
 732     consumed_chars++;                                   \
 733   } while (0)
 734
 735
 736 /* Store a byte C in the place pointed to by DST, increment DST to
 737    the next free point, and increment PRODUCED_CHARS.  The caller
 738    should ensure that C is 0..127, and declare and set the variables
 739    `dst' and `produced_chars' appropriately in advance.
 740 */
 741
 742
 743 #define EMIT_ONE_ASCII_BYTE(c)  \
 744   do {                          \
 745     produced_chars++;           \
 746     *dst++ = (c);               \
 747   } while (0)
 748
 749
 750 /* Like EMIT_ONE_ASCII_BYTE but store two bytes, C1 then C2, and add
        2 to PRODUCED_CHARS.  */
 751
 752 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 753   do {                                  \
 754     produced_chars += 2;                \
 755     *dst++ = (c1), *dst++ = (c2);       \
 756   } while (0)
 757
 758
 759 /* Store a byte C in the place pointed to by DST, increment DST to
 760    the next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP
 761    is nonzero, store in an appropriate multibyte form.  The caller
 762    should declare and set the variables `dst', `multibytep' and
 763    `produced_chars' appropriately in advance.  */
     /* In the multibyte case a value of 0x80 or more is first converted
        with BYTE8_TO_CHAR, and the result is written with
        CHAR_STRING_ADVANCE, which advances DST itself.  */
 764
 765 #define EMIT_ONE_BYTE(c)                \
 766   do {                                  \
 767     produced_chars++;                   \
 768     if (multibytep)                     \
 769       {                                 \
 770         int ch = (c);                   \
 771         if (ch >= 0x80)                 \
 772           ch = BYTE8_TO_CHAR (ch);      \
 773         CHAR_STRING_ADVANCE (ch, dst);  \
 774       }                                 \
 775     else                                \
 776       *dst++ = (c);                     \
 777   } while (0)
 778
 779
 780 /* Like EMIT_ONE_BYTE, but emit two bytes, C1 and then C2, and increase
        PRODUCED_CHARS by two.  In the multibyte case each byte >= 0x80 is
        converted with BYTE8_TO_CHAR before being written with
        CHAR_STRING_ADVANCE.  C1 and C2 are each evaluated once.  */
 781
 782 #define EMIT_TWO_BYTES(c1, c2)          \
 783   do {                                  \
 784     produced_chars += 2;                \
 785     if (multibytep)                     \
 786       {                                 \
 787         int ch;                         \
 788                                         \
 789         ch = (c1);                      \
 790         if (ch >= 0x80)                 \
 791           ch = BYTE8_TO_CHAR (ch);      \
 792         CHAR_STRING_ADVANCE (ch, dst);  \
 793         ch = (c2);                      \
 794         if (ch >= 0x80)                 \
 795           ch = BYTE8_TO_CHAR (ch);      \
 796         CHAR_STRING_ADVANCE (ch, dst);  \
 797       }                                 \
 798     else                                \
 799       {                                 \
 800         *dst++ = (c1);                  \
 801         *dst++ = (c2);                  \
 802       }                                 \
 803   } while (0)
 804
 805
     /* Emit the three bytes C1, C2 and C3 in that order, by way of
        EMIT_ONE_BYTE followed by EMIT_TWO_BYTES; the requirements on the
        caller's variables are those of these two macros.  */
 806 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 807   do {                                  \
 808     EMIT_ONE_BYTE (c1);                 \
 809     EMIT_TWO_BYTES (c2, c3);            \
 810   } while (0)
 811
 812
     /* Emit the four bytes C1, C2, C3 and C4 in that order, as two
        successive EMIT_TWO_BYTES.  */
 813 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 814   do {                                          \
 815     EMIT_TWO_BYTES (c1, c2);                    \
 816     EMIT_TWO_BYTES (c3, c4);                    \
 817   } while (0)
 818
 819
 820 /* Prototypes for static functions.  Every argument list is wrapped
        in P_ ((...)), a macro defined outside this part of the file.
        Each coding-system family below declares a detector, a decoder and
        an encoder (raw-text has no detector); the general helpers come
        after them.  */
 821 static void record_conversion_result P_ ((struct coding_system *coding,
 822                                           enum coding_result_code result));
 823 static int detect_coding_utf_8 P_ ((struct coding_system *,
 824                                     struct coding_detection_info *info));
 825 static void decode_coding_utf_8 P_ ((struct coding_system *));
 826 static int encode_coding_utf_8 P_ ((struct coding_system *));
 827
 828 static int detect_coding_utf_16 P_ ((struct coding_system *,
 829                                      struct coding_detection_info *info));
 830 static void decode_coding_utf_16 P_ ((struct coding_system *));
 831 static int encode_coding_utf_16 P_ ((struct coding_system *));
 832
 833 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 834                                        struct coding_detection_info *info));
 835 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 836 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 837
 838 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 839                                          struct coding_detection_info *info));
 840 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 841 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 842
 843 static int detect_coding_sjis P_ ((struct coding_system *,
 844                                    struct coding_detection_info *info));
 845 static void decode_coding_sjis P_ ((struct coding_system *));
 846 static int encode_coding_sjis P_ ((struct coding_system *));
 847
 848 static int detect_coding_big5 P_ ((struct coding_system *,
 849                                    struct coding_detection_info *info));
 850 static void decode_coding_big5 P_ ((struct coding_system *));
 851 static int encode_coding_big5 P_ ((struct coding_system *));
 852
 853 static int detect_coding_ccl P_ ((struct coding_system *,
 854                                   struct coding_detection_info *info));
 855 static void decode_coding_ccl P_ ((struct coding_system *));
 856 static int encode_coding_ccl P_ ((struct coding_system *));
 857
 858 static void decode_coding_raw_text P_ ((struct coding_system *));
 859 static int encode_coding_raw_text P_ ((struct coding_system *));
 860
     /* Source/destination address bookkeeping and destination growth.  */
 861 static void coding_set_source P_ ((struct coding_system *));
 862 static void coding_set_destination P_ ((struct coding_system *));
 863 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
 864 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
 865                                             EMACS_INT));
 866 static unsigned char *alloc_destination P_ ((struct coding_system *,
 867                                              EMACS_INT, unsigned char *));
     /* Remaining helpers; their definitions lie outside this part of the
        file.  */
 868 static void setup_iso_safe_charsets P_ ((Lisp_Object));
 869 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
 870                                                      int *, int *,
 871                                                      unsigned char *));
 872 static int detect_eol P_ ((const unsigned char *,
 873                            EMACS_INT, enum coding_category));
 874 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
 875 static void decode_eol P_ ((struct coding_system *));
 876 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
 877 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
 878                                         int, int *, int *));
 879 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
 880 static INLINE void produce_composition P_ ((struct coding_system *, int *,
 881                                             EMACS_INT));
 882 static INLINE void produce_charset P_ ((struct coding_system *, int *,
 883                                         EMACS_INT));
 884 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
 885 static int decode_coding P_ ((struct coding_system *));
 886 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
 887                                                       struct coding_system *,
 888                                                       int *, EMACS_INT *));
 889 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
 890                                                   struct coding_system *,
 891                                                   int *, EMACS_INT *));
 892 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
 893 static int encode_coding P_ ((struct coding_system *));
 894 static Lisp_Object make_conversion_work_buffer P_ ((int));
 895 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
 896 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
 897 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 898
     /* Record RESULT as the outcome of the conversion done with CODING.
        RESULT is always stored in CODING->result.  For the error codes
        listed in the switch, Vlast_code_conversion_error is also set to
        the matching symbol.  Any other value (for instance the
        CODING_RESULT_SUCCESS passed by the encoders in this file) leaves
        Vlast_code_conversion_error untouched.  */
 899 static void
 900 record_conversion_result (struct coding_system *coding,
 901                           enum coding_result_code result)
 902 {
 903   coding->result = result;
 904   switch (result)
 905     {
 906     case CODING_RESULT_INSUFFICIENT_SRC:
 907       Vlast_code_conversion_error = Qinsufficient_source;
 908       break;
 909     case CODING_RESULT_INCONSISTENT_EOL:
 910       Vlast_code_conversion_error = Qinconsistent_eol;
 911       break;
 912     case CODING_RESULT_INVALID_SRC:
 913       Vlast_code_conversion_error = Qinvalid_source;
 914       break;
 915     case CODING_RESULT_INTERRUPT:
 916       Vlast_code_conversion_error = Qinterrupted;
 917       break;
 918     case CODING_RESULT_INSUFFICIENT_MEM:
 919       Vlast_code_conversion_error = Qinsufficient_memory;
 920       break;
 921     }
 922 }
 923
     /* Decode CODE of CHARSET into the character C with DECODE_CHAR.
        charset_map_loaded is cleared first; if it is nonzero afterwards
        (presumably because DECODE_CHAR had to load a charset map, which
        may have relocated the source text -- confirm), CODING->source is
        recomputed with coding_set_source, and SRC, SRC_BASE and SRC_END
        are shifted by the distance CODING->source moved so that they
        keep pointing at the same source bytes.  */
 924 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 925   do {                                                                       \
 926     charset_map_loaded = 0;                                                  \
 927     c = DECODE_CHAR (charset, code);                                         \
 928     if (charset_map_loaded)                                                  \
 929       {                                                                      \
 930         const unsigned char *orig = coding->source;                          \
 931         EMACS_INT offset;                                                    \
 932                                                                              \
 933         coding_set_source (coding);                                          \
 934         offset = coding->source - orig;                                      \
 935         src += offset;                                                       \
 936         src_base += offset;                                                  \
 937         src_end += offset;                                                   \
 938       }                                                                      \
 939   } while (0)
 940
 941
     /* Make sure the destination can take BYTES more bytes.  If
        DST + BYTES reaches DST_END, call alloc_destination asking for
        BYTES plus one byte per element still left in the charbuf
        (charbuf_end - charbuf), then reload DST from its return value and
        DST_END from CODING.  The caller must have `coding', `dst',
        `dst_end', `charbuf' and `charbuf_end' in scope.  */
 942 #define ASSURE_DESTINATION(bytes)                               \
 943   do {                                                          \
 944     if (dst + (bytes) >= dst_end)                               \
 945       {                                                         \
 946         int more_bytes = charbuf_end - charbuf + (bytes);       \
 947                                                                 \
 948         dst = alloc_destination (coding, more_bytes, dst);      \
 949         dst_end = coding->destination + coding->dst_bytes;      \
 950       }                                                         \
 951   } while (0)
 952
 953
 954
     /* Recompute CODING->source from CODING->src_object and
        CODING->src_pos_byte.  For a buffer source the address is
        BUF_GAP_END_ADDR (buf) + src_pos_byte when CODING->src_pos is
        negative, and BUF_BYTE_ADDRESS (buf, src_pos_byte) otherwise.  For
        a string source it is SDATA (string) + src_pos_byte.  For any
        other source nothing is changed.  */
 955 static void
 956 coding_set_source (coding)
 957      struct coding_system *coding;
 958 {
 959   if (BUFFERP (coding->src_object))
 960     {
 961       struct buffer *buf = XBUFFER (coding->src_object);
 962
           /* NOTE(review): a negative src_pos presumably marks source
              bytes addressed relative to the end of the gap -- confirm
              against the callers that set src_pos.  */
 963       if (coding->src_pos < 0)
 964         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 965       else
 966         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 967     }
 968   else if (STRINGP (coding->src_object))
 969     {
 970       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 971     }
 972   else
 973     /* Otherwise, the source is C string and is never relocated
 974        automatically.  Thus we don't have to update anything.  */
 975     ;
 976 }
 977
     /* Recompute CODING->destination and CODING->dst_bytes when the
        destination (CODING->dst_object) is a buffer; do nothing
        otherwise.  In both buffer cases the destination address is the
        buffer's beginning address plus dst_pos_byte - 1.  */
 978 static void
 979 coding_set_destination (coding)
 980      struct coding_system *coding;
 981 {
 982   if (BUFFERP (coding->dst_object))
 983     {
 984       if (coding->src_pos < 0)
 985         {
               /* The usable space stops short of GAP_END_ADDR by the
                  number of source bytes not consumed yet.
                  NOTE(review): this branch uses BEG_ADDR and
                  GAP_END_ADDR instead of the BUF_ variants applied to
                  dst_object, so it presumably relies on dst_object being
                  the current buffer -- confirm.  */
 986           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
 987           coding->dst_bytes = (GAP_END_ADDR
 988                                - (coding->src_bytes - coding->consumed)
 989                                - coding->destination);
 990         }
 991       else
 992         {
 993           /* We are sure that coding->dst_pos_byte is before the gap
 994              of the buffer. */
 995           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 996                                  + coding->dst_pos_byte - 1);
 997           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 998                                - coding->destination);
 999         }
1000     }
1001   else
1002     /* Otherwise, the destination is C string and is never relocated
1003        automatically.  Thus we don't have to update anything.  */
1004     ;
1005 }
1006
1007
     /* Grow the destination area of CODING by BYTES bytes with xrealloc.
        CODING->destination is replaced by the pointer xrealloc returns
        and CODING->dst_bytes is increased by BYTES.  */
1008 static void
1009 coding_alloc_by_realloc (coding, bytes)
1010      struct coding_system *coding;
1011      EMACS_INT bytes;
1012 {
1013   coding->destination = (unsigned char *) xrealloc (coding->destination,
1014                                                     coding->dst_bytes + bytes);
1015   coding->dst_bytes += bytes;
1016 }
1017
     /* Make room for BYTES more bytes in the destination buffer of CODING
        by calling make_gap.  The only caller visible here,
        alloc_destination, calls this only when CODING->dst_object is a
        buffer.  */
1018 static void
1019 coding_alloc_by_making_gap (coding, bytes)
1020      struct coding_system *coding;
1021      EMACS_INT bytes;
1022 {
1023   if (BUFFERP (coding->dst_object)
1024       && EQ (coding->src_object, coding->dst_object))
1025     {
           /* ADD is the number of source bytes not consumed yet.  */
1026       EMACS_INT add = coding->src_bytes - coding->consumed;
1027
           /* Around make_gap, temporarily count those bytes as buffer text
              rather than gap (presumably so that make_gap keeps them
              intact while enlarging the gap -- confirm), then undo the
              adjustment.  */
1028       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1029       make_gap (bytes);
1030       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1031     }
1032   else
1033     {
1034       Lisp_Object this_buffer;
1035
           /* Switch to the destination buffer for the make_gap call, then
              switch back to the buffer that was current.  */
1036       this_buffer = Fcurrent_buffer ();
1037       set_buffer_internal (XBUFFER (coding->dst_object));
1038       make_gap (bytes);
1039       set_buffer_internal (XBUFFER (this_buffer));
1040     }
1041 }
1042
1043
     /* Enlarge the destination of CODING by NBYTES bytes: through the
        buffer gap when the destination is a buffer, through xrealloc
        otherwise.  DST points into the destination before the call; its
        offset from CODING->destination is preserved, and the function
        returns the pointer at that same offset in the refreshed
        destination.  CODING_RESULT_SUCCESS is recorded as a side
        effect.  */
1044 static unsigned char *
1045 alloc_destination (coding, nbytes, dst)
1046      struct coding_system *coding;
1047      EMACS_INT nbytes;
1048      unsigned char *dst;
1049 {
1050   EMACS_INT offset = dst - coding->destination;
1051
1052   if (BUFFERP (coding->dst_object))
1053     coding_alloc_by_making_gap (coding, nbytes);
1054   else
1055     coding_alloc_by_realloc (coding, nbytes);
1056   record_conversion_result (coding, CODING_RESULT_SUCCESS);
       /* Refresh CODING->destination and CODING->dst_bytes (only has an
          effect for buffer destinations).  */
1057   coding_set_destination (coding);
1058   dst = coding->destination + offset;
1059   return dst;
1060 }
1061
1062 /** Macros for annotations.  */
1063
1064 /* Maximum length of annotation data (sum of annotations for
1065    composition and charset).  The two 4s match the four-element
        lengths that ADD_COMPOSITION_DATA and ADD_CHARSET_DATA record;
        the middle term presumably allows for up to
        MAX_COMPOSITION_COMPONENTS characters with a rule between each
        adjacent pair -- confirm.  */
1066 #define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1067
1068 /* Annotation data is stored in the array coding->charbuf in this
1069    format:
1070      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1071    LENGTH is the number of elements in the annotation.
1072    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1073    NCHARS is the number of characters in the text annotated.
1074
1075    The format of the following elements depends on ANNOTATION_MASK.
1076
1077    In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1078    follow:
1079      ... METHOD [ COMPOSITION-COMPONENTS ... ]
1080    METHOD is one of enum composition_method.
1081    Optional COMPOSITION-COMPONENTS are characters and composition
1082    rules.
1083
1084    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1085    follows.  */
1086
     /* Store the three-element annotation header [ -LEN MASK NCHARS ] at
        BUF, advancing BUF past it, and set coding->annotated to 1.  The
        caller must have `coding' in scope.  BUF is expanded three times,
        so it must be a plain lvalue without side effects.

        The expansion deliberately does not end with a semicolon: the
        user supplies it, which keeps
        `if (...) ADD_ANNOTATION_DATA (...); else ...' well formed.  */
1087 #define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
1088   do {                                                  \
1089     *(buf)++ = -(len);                                  \
1090     *(buf)++ = (mask);                                  \
1091     *(buf)++ = (nchars);                                \
1092     coding->annotated = 1;                              \
1093   } while (0)
1094
     /* Start a composition annotation at BUF: store the header with
        length 4 and CODING_ANNOTATE_COMPOSITION_MASK for NCHARS
        characters, followed by METHOD.  BUF is left pointing just after
        METHOD.  */
1095 #define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
1096   do {                                                                      \
1097     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
1098     *buf++ = method;                                                        \
1099   } while (0)
1100
1101
     /* Store a complete charset annotation at BUF: the header with length
        4 and CODING_ANNOTATE_CHARSET_MASK for NCHARS characters, followed
        by the charset ID.  BUF is left pointing just after ID.  */
1102 #define ADD_CHARSET_DATA(buf, nchars, id)                               \
1103   do {                                                                  \
1104     ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
1105     *buf++ = id;                                                        \
1106   } while (0)
1107
1108 \f
1109 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1110
1111
1112
1113 \f
1114 /*** 3. UTF-8 ***/
1115
1116 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1117    Check if a text is encoded in UTF-8.  If it is, return 1, else
1118    return 0.

        The macros below classify a single octet C by its high bits: a
        one-octet character (below 0x80), a continuation octet (10xxxxxx),
        or the leading octet of a 2-, 3-, 4- or 5-octet sequence (110xxxxx,
        1110xxxx, 11110xxx, 111110xx).  */
1119
1120 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1121 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1122 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1123 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1124 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1125 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1126
     /* Check whether the source of CODING, starting after its first
        CODING->head_ascii bytes, is structurally valid UTF-8: every
        non-ASCII leading octet must be followed by the right number (1 to
        4) of continuation octets.  Decoded values are not examined here,
        so overlong forms are not rejected by the detector.

        On a malformed sequence, CATEGORY_MASK_UTF_8 is added to
        DETECT_INFO->rejected and 0 is returned.  When the source runs out
        (label no_more_source, presumably reached through ONE_MORE_BYTE,
        which is defined above this part of the file), the category is
        rejected only if a sequence was cut short and this is the last
        block; otherwise `found' is merged into DETECT_INFO->found and 1
        is returned.  `found' is nonzero only if at least one multi-octet
        sequence was seen.  */
1127 static int
1128 detect_coding_utf_8 (coding, detect_info)
1129      struct coding_system *coding;
1130      struct coding_detection_info *detect_info;
1131 {
1132   const unsigned char *src = coding->source, *src_base;
1133   const unsigned char *src_end = coding->source + coding->src_bytes;
       /* consumed_chars is updated by ONE_MORE_BYTE; multibytep is
          presumably read by it as well.  */
1134   int multibytep = coding->src_multibyte;
1135   int consumed_chars = 0;
1136   int found = 0;
1137
1138   detect_info->checked |= CATEGORY_MASK_UTF_8;
1139   /* A coding system of this category is always ASCII compatible.  */
1140   src += coding->head_ascii;
1141
1142   while (1)
1143     {
1144       int c, c1, c2, c3, c4;
1145
           /* Remember where this sequence starts so that a truncated
              sequence can be recognized at no_more_source.  */
1146       src_base = src;
1147       ONE_MORE_BYTE (c);
           /* A negative value or a one-octet character needs no further
              checking.  */
1148       if (c < 0 || UTF_8_1_OCTET_P (c))
1149         continue;
1150       ONE_MORE_BYTE (c1);
1151       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1152         break;
1153       if (UTF_8_2_OCTET_LEADING_P (c))
1154         {
1155           found = CATEGORY_MASK_UTF_8;
1156           continue;
1157         }
1158       ONE_MORE_BYTE (c2);
1159       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1160         break;
1161       if (UTF_8_3_OCTET_LEADING_P (c))
1162         {
1163           found = CATEGORY_MASK_UTF_8;
1164           continue;
1165         }
1166       ONE_MORE_BYTE (c3);
1167       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1168         break;
1169       if (UTF_8_4_OCTET_LEADING_P (c))
1170         {
1171           found = CATEGORY_MASK_UTF_8;
1172           continue;
1173         }
1174       ONE_MORE_BYTE (c4);
1175       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1176         break;
1177       if (UTF_8_5_OCTET_LEADING_P (c))
1178         {
1179           found = CATEGORY_MASK_UTF_8;
1180           continue;
1181         }
           /* C is not a valid leading octet of any supported length.  */
1182       break;
1183     }
       /* Reached only through `break': the text is not UTF-8.  */
1184   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1185   return 0;
1186
1187  no_more_source:
       /* src_base < src means the source ended in the middle of a
          sequence; that is an error only in the last block.  */
1188   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1189     {
1190       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1191       return 0;
1192     }
1193   detect_info->found |= found;
1194   return 1;
1195 }
1196
1197
     /* Decode the UTF-8 source of CODING, starting at CODING->consumed,
        into CODING->charbuf, one element per character, until the charbuf
        is full or the source runs out.  Sequences of 1 to 5 octets are
        accepted.  Overlong forms, surrogate code points (0xD800..0xDFFF)
        and values above MAX_CHAR are treated as invalid: for an invalid
        sequence only its first byte is consumed, it is stored as itself
        (ASCII) or as BYTE8_TO_CHAR of the byte, and CODING->errors is
        incremented.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used describe the position just before the
        sequence that was being read, so a sequence cut short by the end
        of the source is left unconsumed.  */
1198 static void
1199 decode_coding_utf_8 (coding)
1200      struct coding_system *coding;
1201 {
1202   const unsigned char *src = coding->source + coding->consumed;
1203   const unsigned char *src_end = coding->source + coding->src_bytes;
1204   const unsigned char *src_base;
1205   int *charbuf = coding->charbuf + coding->charbuf_used;
1206   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1207   int consumed_chars = 0, consumed_chars_base;
1208   int multibytep = coding->src_multibyte;
1209   Lisp_Object attr, charset_list;
1210
1211   CODING_GET_INFO (coding, attr, charset_list);
1212
1213   while (1)
1214     {
1215       int c, c1, c2, c3, c4, c5;
1216
           /* Checkpoint: both the invalid_code path and no_more_source
              fall back to this position.  */
1217       src_base = src;
1218       consumed_chars_base = consumed_chars;
1219
1220       if (charbuf >= charbuf_end)
1221         break;
1222
1223       ONE_MORE_BYTE (c1);
1224       if (c1 < 0)
1225         {
               /* ONE_MORE_BYTE hands back a character read from a
                  multibyte source negated; store the character itself.  */
1226           c = - c1;
1227         }
1228       else if (UTF_8_1_OCTET_P(c1))
1229         {
1230           c = c1;
1231         }
1232       else
1233         {
1234           ONE_MORE_BYTE (c2);
1235           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1236             goto invalid_code;
1237           if (UTF_8_2_OCTET_LEADING_P (c1))
1238             {
1239               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1240               /* Reject overlong sequences here and below.  Encoders
1241                  producing them are incorrect, they can be misleading,
1242                  and they mess up read/write invariance.  */
1243               if (c < 128)
1244                 goto invalid_code;
1245             }
1246           else
1247             {
1248               ONE_MORE_BYTE (c3);
1249               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1250                 goto invalid_code;
1251               if (UTF_8_3_OCTET_LEADING_P (c1))
1252                 {
1253                   c = (((c1 & 0xF) << 12)
1254                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1255                   if (c < 0x800
1256                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1257                     goto invalid_code;
1258                 }
1259               else
1260                 {
1261                   ONE_MORE_BYTE (c4);
1262                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1263                     goto invalid_code;
1264                   if (UTF_8_4_OCTET_LEADING_P (c1))
1265                     {
1266                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1267                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
                         /* Below 0x10000 a shorter form exists.  */
1268                     if (c < 0x10000)
1269                       goto invalid_code;
1270                     }
1271                   else
1272                     {
1273                       ONE_MORE_BYTE (c5);
1274                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1275                         goto invalid_code;
1276                       if (UTF_8_5_OCTET_LEADING_P (c1))
1277                         {
1278                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1279                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1280                                | (c5 & 0x3F));
                               /* Too large for a character, or overlong.  */
1281                           if ((c > MAX_CHAR) || (c < 0x200000))
1282                             goto invalid_code;
1283                         }
1284                       else
1285                         goto invalid_code;
1286                     }
1287                 }
1288             }
1289         }
1290
1291       *charbuf++ = c;
1292       continue;
1293
1294     invalid_code:
           /* Rewind to the start of the bad sequence and consume just its
              first byte as a raw byte.  */
1295       src = src_base;
1296       consumed_chars = consumed_chars_base;
1297       ONE_MORE_BYTE (c);
1298       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1299       coding->errors++;
1300     }
1301
1302  no_more_source:
       /* Report progress up to the last checkpoint only.  */
1303   coding->consumed_char += consumed_chars_base;
1304   coding->consumed = src_base - coding->source;
1305   coding->charbuf_used = charbuf - coding->charbuf;
1306 }
1307
1308
     /* Encode the CODING->charbuf_used characters in CODING->charbuf and
        append the bytes to the destination of CODING, enlarging it with
        ASSURE_DESTINATION as needed.  A character for which CHAR_BYTE8_P
        holds is written as the single byte CHAR_TO_BYTE8 gives; any other
        character is written in the form CHAR_STRING / CHAR_STRING_ADVANCE
        produce.  CODING->produced and CODING->produced_char are updated,
        CODING_RESULT_SUCCESS is recorded, and 0 is always returned.  */
1309 static int
1310 encode_coding_utf_8 (coding)
1311      struct coding_system *coding;
1312 {
1313   int multibytep = coding->dst_multibyte;
1314   int *charbuf = coding->charbuf;
1315   int *charbuf_end = charbuf + coding->charbuf_used;
1316   unsigned char *dst = coding->destination + coding->produced;
1317   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1318   int produced_chars = 0;
1319   int c;
1320
1321   if (multibytep)
1322     {
           /* Every output byte goes through EMIT_ONE_BYTE, which may
              rewrite a byte >= 0x80 in multibyte form; the doubled room
              presumably allows for each byte growing to two -- confirm.
              Note that produced_chars counts one per emitted byte in this
              branch.  */
1323       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1324
1325       while (charbuf < charbuf_end)
1326         {
1327           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1328
1329           ASSURE_DESTINATION (safe_room);
1330           c = *charbuf++;
1331           if (CHAR_BYTE8_P (c))
1332             {
1333               c = CHAR_TO_BYTE8 (c);
1334               EMIT_ONE_BYTE (c);
1335             }
1336           else
1337             {
                   /* Build the byte sequence in STR first, then emit it
                      byte by byte.  */
1338               CHAR_STRING_ADVANCE (c, pend);
1339               for (p = str; p < pend; p++)
1340                 EMIT_ONE_BYTE (*p);
1341             }
1342         }
1343     }
1344   else
1345     {
1346       int safe_room = MAX_MULTIBYTE_LENGTH;
1347
           /* Unibyte destination: bytes are stored directly and
              produced_chars counts one per character.  */
1348       while (charbuf < charbuf_end)
1349         {
1350           ASSURE_DESTINATION (safe_room);
1351           c = *charbuf++;
1352           if (CHAR_BYTE8_P (c))
1353             *dst++ = CHAR_TO_BYTE8 (c);
1354           else
1355             dst += CHAR_STRING (c, dst);
1356           produced_chars++;
1357         }
1358     }
1359   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1360   coding->produced_char += produced_chars;
1361   coding->produced = dst - coding->destination;
1362   return 0;
1363 }
1364
1365
1366 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1367    Check if a text is encoded in one of UTF-16 based coding systems.
1368    If it is, return 1, else return 0.

        The macros below classify a 16-bit code unit VAL:
        UTF_16_HIGH_SURROGATE_P holds for 0xD800..0xDBFF,
        UTF_16_LOW_SURROGATE_P for 0xDC00..0xDFFF, and UTF_16_INVALID_P
        for 0xFFFE, 0xFFFF and any low surrogate.  */
1369
1370 #define UTF_16_HIGH_SURROGATE_P(val) \
1371   (((val) & 0xFC00) == 0xD800)
1372
1373 #define UTF_16_LOW_SURROGATE_P(val) \
1374   (((val) & 0xFC00) == 0xDC00)
1375
1376 #define UTF_16_INVALID_P(val)   \
1377   (((val) == 0xFFFE)            \
1378    || ((val) == 0xFFFF)         \
1379    || UTF_16_LOW_SURROGATE_P (val))
1380
1381
     /* Examine the first two bytes of the source of CODING for a UTF-16
        byte order mark and update DETECT_INFO accordingly.  Only the BOM
        is looked at; the rest of the text is not scanned.

        If this is the last block and CODING->src_chars is odd, all of
        CATEGORY_MASK_UTF_16 is rejected and 0 is returned.  Otherwise 1
        is returned: FF FE marks the LE and AUTO categories as found,
        FE FF the BE and AUTO categories, and in both cases the opposite
        byte order and the two NOSIG categories are rejected.  With any
        other pair of non-negative bytes the BE and LE categories are
        rejected.  If fewer than two bytes are available (label
        no_more_source, presumably reached through ONE_MORE_BYTE), nothing
        more is recorded.  */
1382 static int
1383 detect_coding_utf_16 (coding, detect_info)
1384      struct coding_system *coding;
1385      struct coding_detection_info *detect_info;
1386 {
1387   const unsigned char *src = coding->source, *src_base = src;
1388   const unsigned char *src_end = coding->source + coding->src_bytes;
1389   int multibytep = coding->src_multibyte;
1390   int consumed_chars = 0;
1391   int c1, c2;
1392
1393   detect_info->checked |= CATEGORY_MASK_UTF_16;
1394   if (coding->mode & CODING_MODE_LAST_BLOCK
1395       && (coding->src_chars & 1))
1396     {
1397       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1398       return 0;
1399     }
1400
1401   ONE_MORE_BYTE (c1);
1402   ONE_MORE_BYTE (c2);
1403   if ((c1 == 0xFF) && (c2 == 0xFE))
1404     {
1405       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1406                              | CATEGORY_MASK_UTF_16_AUTO);
1407       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1408                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1409                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1410     }
1411   else if ((c1 == 0xFE) && (c2 == 0xFF))
1412     {
1413       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1414                              | CATEGORY_MASK_UTF_16_AUTO);
1415       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1416                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1417                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1418     }
1419   else if (c1 >= 0 && c2 >= 0)
1420     {
           /* No BOM: the categories that presumably require one are
              ruled out.  */
1421       detect_info->rejected
1422         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1423     }
1424  no_more_source:
1425   return 1;
1426 }
1427
     /* Decode the UTF-16 source of CODING, starting at CODING->consumed,
        into CODING->charbuf.  The byte order comes from
        CODING_UTF_16_ENDIAN.  If CODING_UTF_16_BOM says a BOM is
        expected, the first two bytes are checked against it; when they
        are not the BOM they are decoded as ordinary text and
        CODING->errors is incremented.  In either case the BOM state is
        switched to utf_16_without_bom so that later calls do not look for
        it again.

        A high surrogate waiting for its partner is kept in
        CODING_UTF_16_SURROGATE between code units (and between calls).  A
        proper pair yields one character at 0x10000 or above.  A pending
        high surrogate not followed by a low surrogate is emitted as its
        two raw bytes and counted as an error.

        On return CODING->consumed, CODING->consumed_char and
        CODING->charbuf_used describe the position just before the code
        unit that was being read.  */
1428 static void
1429 decode_coding_utf_16 (coding)
1430      struct coding_system *coding;
1431 {
1432   const unsigned char *src = coding->source + coding->consumed;
1433   const unsigned char *src_end = coding->source + coding->src_bytes;
1434   const unsigned char *src_base;
1435   int *charbuf = coding->charbuf + coding->charbuf_used;
1436   int *charbuf_end = coding->charbuf + coding->charbuf_size;
       /* consumed_chars_base must start at 0: if the source ends while
          the BOM is being read, control reaches no_more_source before the
          main loop has ever assigned it.  */
1437   int consumed_chars = 0, consumed_chars_base = 0;
1438   int multibytep = coding->src_multibyte;
1439   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1440   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1441   int surrogate = CODING_UTF_16_SURROGATE (coding);
1442   Lisp_Object attr, charset_list;
1443
1444   CODING_GET_INFO (coding, attr, charset_list);
1445
1446   if (bom == utf_16_with_bom)
1447     {
1448       int c, c1, c2;
1449
1450       src_base = src;
1451       ONE_MORE_BYTE (c1);
1452       ONE_MORE_BYTE (c2);
1453       c = (c1 << 8) | c2;
1454
1455       if (endian == utf_16_big_endian
1456           ? c != 0xFEFF : c != 0xFFFE)
1457         {
1458           /* The first two bytes are not BOM.  Treat them as bytes
1459              for a normal character.  */
1460           src = src_base;
               /* The main loop reads these bytes again, so they must not
                  stay counted as consumed.  */
               consumed_chars = 0;
1461           coding->errors++;
1462         }
1463       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1464     }
1465   else if (bom == utf_16_detect_bom)
1466     {
1467       /* We have already tried to detect BOM and failed in
1468          detect_coding.  */
1469       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1470     }
1471
1472   while (1)
1473     {
1474       int c, c1, c2;
1475
           /* Checkpoint used by no_more_source.  */
1476       src_base = src;
1477       consumed_chars_base = consumed_chars;
1478
           /* One iteration can store up to three elements (see the
              unpaired surrogate case below), so stop while three slots
              are still free.  */
1479       if (charbuf + 2 >= charbuf_end)
1480         break;
1481
1482       ONE_MORE_BYTE (c1);
1483       if (c1 < 0)
1484         {
               /* A negative value is a negated character from a multibyte
                  source; pass the character through.  */
1485           *charbuf++ = -c1;
1486           continue;
1487         }
1488       ONE_MORE_BYTE (c2);
1489       if (c2 < 0)
1490         {
1491           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1492           *charbuf++ = -c2;
1493           continue;
1494         }
1495       c = (endian == utf_16_big_endian
1496            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1497       if (surrogate)
1498         {
1499           if (! UTF_16_LOW_SURROGATE_P (c))
1500             {
                   /* The pending high surrogate has no partner: emit its
                      two bytes in source byte order and count an error.
                      The new unit then either becomes the pending high
                      surrogate or is stored as it is.  */
1501               if (endian == utf_16_big_endian)
1502                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1503               else
1504                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1505               *charbuf++ = c1;
1506               *charbuf++ = c2;
1507               coding->errors++;
1508               if (UTF_16_HIGH_SURROGATE_P (c))
1509                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1510               else
1511                 *charbuf++ = c;
1512             }
1513           else
1514             {
                   /* Combine the pair into one supplementary character.  */
1515               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1516               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1517               *charbuf++ = 0x10000 + c;
1518             }
1519         }
1520       else
1521         {
1522           if (UTF_16_HIGH_SURROGATE_P (c))
1523             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1524           else
1525             *charbuf++ = c;
1526         }
1527     }
1528
1529  no_more_source:
1530   coding->consumed_char += consumed_chars_base;
1531   coding->consumed = src_base - coding->source;
1532   coding->charbuf_used = charbuf - coding->charbuf;
1533 }
1534
     /* Encode the CODING->charbuf_used characters in CODING->charbuf as
        UTF-16 and append the bytes to the destination of CODING,
        enlarging it with ASSURE_DESTINATION as needed.  The byte order
        comes from CODING_UTF_16_ENDIAN.  Unless CODING_UTF_16_BOM is
        utf_16_without_bom, a BOM is written first and the BOM state is
        then switched to utf_16_without_bom so that it is written only
        once.

        A character above MAX_UNICODE_CHAR is replaced by
        CODING->default_char.  Characters below 0x10000 take one 16-bit
        unit, the others a surrogate pair.  CODING->produced and
        CODING->produced_char are updated, CODING_RESULT_SUCCESS is
        recorded, and 0 is always returned.  */
1535 static int
1536 encode_coding_utf_16 (coding)
1537      struct coding_system *coding;
1538 {
1539   int multibytep = coding->dst_multibyte;
1540   int *charbuf = coding->charbuf;
1541   int *charbuf_end = charbuf + coding->charbuf_used;
1542   unsigned char *dst = coding->destination + coding->produced;
1543   unsigned char *dst_end = coding->destination + coding->dst_bytes;
       /* At most four bytes are emitted per character; the room of eight
          presumably allows for each of them taking two bytes in a
          multibyte destination -- confirm.  */
1544   int safe_room = 8;
1545   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1546   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1547   int produced_chars = 0;
1548   Lisp_Object attrs, charset_list;
1549   int c;
1550
1551   CODING_GET_INFO (coding, attrs, charset_list);
1552
1553   if (bom != utf_16_without_bom)
1554     {
1555       ASSURE_DESTINATION (safe_room);
1556       if (big_endian)
1557         EMIT_TWO_BYTES (0xFE, 0xFF);
1558       else
1559         EMIT_TWO_BYTES (0xFF, 0xFE);
1560       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1561     }
1562
1563   while (charbuf < charbuf_end)
1564     {
1565       ASSURE_DESTINATION (safe_room);
1566       c = *charbuf++;
           /* Only characters strictly above MAX_UNICODE_CHAR are
              unencodable; MAX_UNICODE_CHAR itself is encoded (this
              assumes it names the largest valid code point, 0x10FFFF --
              confirm against its definition).  */
1567       if (c > MAX_UNICODE_CHAR)
1568         c = coding->default_char;
1569
1570       if (c < 0x10000)
1571         {
1572           if (big_endian)
1573             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1574           else
1575             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1576         }
1577       else
1578         {
1579           int c1, c2;
1580
               /* Split into a high (C1) and a low (C2) surrogate.  */
1581           c -= 0x10000;
1582           c1 = (c >> 10) + 0xD800;
1583           c2 = (c & 0x3FF) + 0xDC00;
1584           if (big_endian)
1585             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1586           else
1587             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1588         }
1589     }
1590   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1591   coding->produced = dst - coding->destination;
1592   coding->produced_char += produced_chars;
1593   return 0;
1594 }
1595
1596 \f
1597 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1598
1599 /* Emacs' internal format for representation of multiple character
1600    sets is a kind of multi-byte encoding, i.e. characters are
1601    represented by variable-length sequences of one-byte codes.
1602
1603    ASCII characters and control characters (e.g. `tab', `newline') are
1604    represented by one-byte sequences which are their ASCII codes, in
1605    the range 0x00 through 0x7F.
1606
1607    8-bit characters of the range 0x80..0x9F are represented by
1608    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1609    code + 0x20).
1610
1611    8-bit characters of the range 0xA0..0xFF are represented by
1612    one-byte sequences which are their 8-bit code.
1613
1614    The other characters are represented by a sequence of `base
1615    leading-code', optional `extended leading-code', and one or two
1616    `position-code's.  The length of the sequence is determined by the
1617    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1618    whereas extended leading-code and position-code take the range 0xA0
1619    through 0xFF.  See `charset.h' for more details about leading-code
1620    and position-code.
1621
1622    --- CODE RANGE of Emacs' internal format ---
1623    character set        range
1624    -------------        -----
1625    ascii                0x00..0x7F
1626    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1627    eight-bit-graphic    0xA0..0xFF
1628    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1629    ---------------------------------------------
1630
1631    As this is the internal character representation, the format is
1632    usually not used externally (i.e. in a file or in a data sent to a
1633    process).  But, it is possible to have a text externally in this
1634    format (i.e. by encoding by the coding system `emacs-mule').
1635
1636    In that case, a sequence of one-byte codes has a slightly different
1637    form.
1638
1639    At first, all characters in eight-bit-control are represented by
1640    one-byte sequences which are their 8-bit code.
1641
1642    Next, character composition data are represented by the byte
1643    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1644    where,
1645         METHOD is 0xF2 plus one of the composition methods (enum
1646         composition_method),
1647
1648         BYTES is 0xA0 plus a byte length of this composition data,
1649
1650         CHARS is 0xA0 plus the number of characters composed by this
1651         data,
1652
1653         COMPONENTs are characters in multibyte form or composition
1654         rules encoded as two bytes of ASCII codes.
1655
1656    In addition, for backward compatibility, the following formats are
1657    also recognized as composition data on decoding.
1658
1659    0x80 MSEQ ...
1660    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1661
1662    Here,
1663         MSEQ is a multibyte form but in these special format:
1664           ASCII: 0xA0 ASCII_CODE+0x80,
1665           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1666         RULE is a one byte code of the range 0xA0..0xF0 that
1667         represents a composition rule.
1668   */
1669
/* Table indexed by a byte value.  The code in this section uses the
   entry for the first byte of an emacs-mule sequence as the total
   number of bytes in that sequence: emacs_mule_char switches on it
   (handling the values 1 through 4 and aborting on anything else).
   The table is not filled in here.  NOTE(review): presumably it is
   initialized elsewhere in this file -- confirm.  */
1670 char emacs_mule_bytes[256];
1671
/* Decode one character of emacs-mule form starting at SRC.

   On success return the character code, store the number of source
   bytes used in *NBYTES, store the value of consumed_chars (as
   advanced by ONE_MORE_BYTE) in *NCHARS and, if ID is not NULL, store
   the id of the character's charset in *ID.

   Return -1, without touching the output arguments, if the bytes do
   not form a valid sequence or if DECODE_CHAR yields a negative
   value.  Return -2 if ONE_MORE_BYTE jumps to no_more_source.
   NOTE(review): ONE_MORE_BYTE is defined outside this excerpt;
   presumably it jumps there when SRC reaches SRC_END, and presumably
   it is what uses the otherwise unreferenced locals src_end and
   multibytep -- confirm.  */
1672 int
1673 emacs_mule_char (coding, src, nbytes, nchars, id)
1674      struct coding_system *coding;
1675      const unsigned char *src;
1676      int *nbytes, *nchars, *id;
1677 {
1678   const unsigned char *src_end = coding->source + coding->src_bytes;
1679   const unsigned char *src_base = src;
1680   int multibytep = coding->src_multibyte;
1681   struct charset *charset;
1682   unsigned code;
1683   int c;
1684   int consumed_chars = 0;
1685
1686   ONE_MORE_BYTE (c);
1687   if (c < 0)
1688     {
       /* ONE_MORE_BYTE produced a negative value: return its negation
          as the character, attributed to emacs_mule_charset[0].
          NOTE(review): presumably a negative value marks a character
          that ONE_MORE_BYTE already decoded -- confirm.  */
1689       c = -c;
1690       charset = emacs_mule_charset[0];
1691     }
1692   else
1693     {
       /* Dispatch on the sequence length recorded for the first byte.  */
1694       switch (emacs_mule_bytes[c])
1695         {
1696         case 2:
           /* Leading code followed by one position code.  */
1697           if (! (charset = emacs_mule_charset[c]))
1698             goto invalid_code;
1699           ONE_MORE_BYTE (c);
1700           if (c < 0xA0)
1701             goto invalid_code;
1702           code = c & 0x7F;
1703           break;
1704
1705         case 3:
1706           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1707               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1708             {
               /* Private leading code, a second byte that selects the
                  charset, then one position code.  */
1709               ONE_MORE_BYTE (c);
1710               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1711                 goto invalid_code;
1712               ONE_MORE_BYTE (c);
1713               if (c < 0xA0)
1714                 goto invalid_code;
1715               code = c & 0x7F;
1716             }
1717           else
1718             {
               /* Leading code followed by two position codes, combined
                  into a 16-bit code with the high bits stripped.  */
1719               if (! (charset = emacs_mule_charset[c]))
1720                 goto invalid_code;
1721               ONE_MORE_BYTE (c);
1722               if (c < 0xA0)
1723                 goto invalid_code;
1724               code = (c & 0x7F) << 8;
1725               ONE_MORE_BYTE (c);
1726               if (c < 0xA0)
1727                 goto invalid_code;
1728               code |= c & 0x7F;
1729             }
1730           break;
1731
1732         case 4:
           /* First byte, a second byte that selects the charset, then
              two position codes.  Note that the second byte is only
              checked for being non-negative here, not for >= 0xA0.  */
1733           ONE_MORE_BYTE (c);
1734           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1735             goto invalid_code;
1736           ONE_MORE_BYTE (c);
1737           if (c < 0xA0)
1738             goto invalid_code;
1739           code = (c & 0x7F) << 8;
1740           ONE_MORE_BYTE (c);
1741           if (c < 0xA0)
1742             goto invalid_code;
1743           code |= c & 0x7F;
1744           break;
1745
1746         case 1:
           /* Single byte: ASCII or an eight-bit byte.  */
1747           code = c;
1748           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1749                                      ? charset_ascii : charset_eight_bit);
1750           break;
1751
1752         default:
1753           abort ();
1754         }
1755       c = DECODE_CHAR (charset, code);
1756       if (c < 0)
1757         goto invalid_code;
1758     }
1759   *nbytes = src - src_base;
1760   *nchars = consumed_chars;
1761   if (id)
1762     *id = charset->id;
1763   return c;
1764
1765  no_more_source:
1766   return -2;
1767
1768  invalid_code:
1769   return -1;
1770 }
1771
1772
1773 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1774    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1775    else return 0.  */
1776
1777 static int
1778 detect_coding_emacs_mule (coding, detect_info)
1779      struct coding_system *coding;
1780      struct coding_detection_info *detect_info;
1781 {
1782   const unsigned char *src = coding->source, *src_base;
1783   const unsigned char *src_end = coding->source + coding->src_bytes;
1784   int multibytep = coding->src_multibyte;
1785   int consumed_chars = 0;
1786   int c;
1787   int found = 0;
1788
1789   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1790   /* A coding system of this category is always ASCII compatible.  */
1791   src += coding->head_ascii;
1792
1793   while (1)
1794     {
1795       src_base = src;
1796       ONE_MORE_BYTE (c);
1797       if (c < 0)
1798         continue;
1799       if (c == 0x80)
1800         {
1801           /* Perhaps the start of composite character.  We simple skip
1802              it because analyzing it is too heavy for detecting.  But,
1803              at least, we check that the composite character
1804              constitues of more than 4 bytes.  */
1805           const unsigned char *src_base;
1806
1807         repeat:
1808           src_base = src;
1809           do
1810             {
1811               ONE_MORE_BYTE (c);
1812             }
1813           while (c >= 0xA0);
1814
1815           if (src - src_base <= 4)
1816             break;
1817           found = CATEGORY_MASK_EMACS_MULE;
1818           if (c == 0x80)
1819             goto repeat;
1820         }
1821
1822       if (c < 0x80)
1823         {
1824           if (c < 0x20
1825               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1826             break;
1827         }
1828       else
1829         {
1830           int more_bytes = emacs_mule_bytes[*src_base] - 1;
1831
1832           while (more_bytes > 0)
1833             {
1834               ONE_MORE_BYTE (c);
1835               if (c < 0xA0)
1836                 {
1837                   src--;        /* Unread the last byte.  */
1838                   break;
1839                 }
1840               more_bytes--;
1841             }
1842           if (more_bytes != 0)
1843             break;
1844           found = CATEGORY_MASK_EMACS_MULE;
1845         }
1846     }
1847   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1848   return 0;
1849
1850  no_more_source:
1851   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1852     {
1853       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1854       return 0;
1855     }
1856   detect_info->found |= found;
1857   return 1;
1858 }
1859
1860
1861 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1862
1863 /* Decode one component character of an Emacs 20/21 style composition
1864    sequence at SRC with emacs_mule_char.  Store the character in *BUF,
1865    increment BUF, and advance SRC and consumed_chars past it.  If SRC
1866    is at SRC_END or emacs_mule_char returns -2, execute `break' in the
1867    caller's context; on any other failure jump to invalid_code.  The
1868    expansion ends in `else', which swallows the caller's `;'.  */
1869
1870 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1871   if (1)                                                        \
1872     {                                                           \
1873       int c;                                                    \
1874       int nbytes, nchars;                                       \
1875                                                                 \
1876       if (src == src_end)                                       \
1877         break;                                                  \
1878       c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1879       if (c < 0)                                                \
1880         {                                                       \
1881           if (c == -2)                                          \
1882             break;                                              \
1883           goto invalid_code;                                    \
1884         }                                                       \
1885       *buf++ = c;                                               \
1886       src += nbytes;                                            \
1887       consumed_chars += nchars;                                 \
1888     }                                                           \
1889   else
1890
1891
1892 /* Decode a composition rule represented as a component of composition
1893    sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
1894    and increment BUF.  If SRC is at SRC_END, or the value read minus
1895    0x20 is not in the range 0..80, jump to invalid_code.  */
1896
1897 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1898   do {                                                  \
1899     int c, gref, nref;                                  \
1900                                                         \
1901     if (src >= src_end)                                 \
1902       goto invalid_code;                                \
1903     ONE_MORE_BYTE_NO_CHECK (c);                         \
1904     c -= 0x20;                                          \
1905     if (c < 0 || c >= 81)                               \
1906       goto invalid_code;                                \
1907                                                         \
1908     gref = c / 9, nref = c % 9;                         \
1909     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1910   } while (0)
1911
1912
1913 /* Decode a composition rule represented as a component of composition
1914    sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
1915    and increment BUF.  If fewer than two bytes remain, or either value
1916    read minus 0x20 is not in the range 0..80, jump to invalid_code.  */
1917
1918 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1919   do {                                                  \
1920     int gref, nref;                                     \
1921                                                         \
1922     if (src + 1>= src_end)                              \
1923       goto invalid_code;                                \
1924     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1925     gref -= 0x20;                                       \
1926     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1927     nref -= 0x20;                                       \
1928     if (gref < 0 || gref >= 81                          \
1929         || nref < 0 || nref >= 81)                      \
1930       goto invalid_code;                                \
1931     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1932   } while (0)
1933
1934
/* Decode an Emacs 21 style composition.  C must hold the METHOD byte
   on entry.  Read the BYTES and CHARS bytes and record the
   composition with ADD_COMPOSITION_DATA.  Unless the method is
   COMPOSITION_RELATIVE, then decode components until consumed_chars
   reaches consumed_chars_base + BYTES: odd-numbered components are
   rules unless the method is COMPOSITION_WITH_ALTCHARS, all other
   components are characters.  Jump to invalid_code if a header byte
   is negative, if BYTES is less than 3, or if component decoding
   stops before that limit.  The macro uses charbuf, consumed_chars
   and consumed_chars_base from the place where it is expanded.
   NOTE(review): the limit adds a byte count to a character count;
   presumably the two agree for the sources this is used on --
   confirm.  */
1935 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1936   do {                                                                  \
1937     /* Emacs 21 style format.  The first three bytes at SRC are         \
1938        (METHOD + 0xF2), (BYTES + 0xA0), (CHARS + 0xA0), where BYTES is  \
1939        the byte length of this composition information, CHARS is the    \
1940        number of characters composed by this composition.  */           \
1941     enum composition_method method = c - 0xF2;                          \
1942     int *charbuf_base = charbuf;                                        \
1943     int consumed_chars_limit;                                           \
1944     int nbytes, nchars;                                                 \
1945                                                                         \
1946     ONE_MORE_BYTE (c);                                                  \
1947     if (c < 0)                                                          \
1948       goto invalid_code;                                                \
1949     nbytes = c - 0xA0;                                                  \
1950     if (nbytes < 3)                                                     \
1951       goto invalid_code;                                                \
1952     ONE_MORE_BYTE (c);                                                  \
1953     if (c < 0)                                                          \
1954       goto invalid_code;                                                \
1955     nchars = c - 0xA0;                                                  \
1956     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
1957     consumed_chars_limit = consumed_chars_base + nbytes;                \
1958     if (method != COMPOSITION_RELATIVE)                                 \
1959       {                                                                 \
1960         int i = 0;                                                      \
1961         while (consumed_chars < consumed_chars_limit)                   \
1962           {                                                             \
1963             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1964               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1965             else                                                        \
1966               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1967             i++;                                                        \
1968           }                                                             \
1969         if (consumed_chars < consumed_chars_limit)                      \
1970           goto invalid_code;                                            \
1971         charbuf_base[0] -= i;                                           \
1972       }                                                                 \
1973   } while (0)
1974
1975
/* Decode an Emacs 20 style relative composition.  Rewind SRC to
   SRC_BASE, skip the 0x80 byte, then decode up to
   MAX_COMPOSITION_COMPONENTS characters into a local array; decoding
   stops early when DECODE_EMACS_MULE_COMPOSITION_CHAR executes
   `break'.  Jump to invalid_code if fewer than two characters were
   decoded.  Otherwise record the composition with
   ADD_COMPOSITION_DATA and copy the characters to CHARBUF.  */
1976 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1977   do {                                                          \
1978     /* Emacs 20 style format for relative composition.  */      \
1979     /* Store multibyte form of characters to be composed.  */   \
1980     enum composition_method method = COMPOSITION_RELATIVE;      \
1981     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1982     int *buf = components;                                      \
1983     int i, j;                                                   \
1984                                                                 \
1985     src = src_base;                                             \
1986     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1987     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1988       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
1989     if (i < 2)                                                  \
1990       goto invalid_code;                                        \
1991     ADD_COMPOSITION_DATA (charbuf, i, method);                  \
1992     for (j = 0; j < i; j++)                                     \
1993       *charbuf++ = components[j];                               \
1994   } while (0)
1995
1996
/* Decode an Emacs 20 style rule-base composition, i.e. a first
   character followed by (rule, character) pairs.  The components are
   collected in a local array first.  Jump to invalid_code if no pair
   was decoded or the collected components do not end with a
   character, and to no_more_source if CHARBUF lacks room for the
   result.  Otherwise record the composition with ADD_COMPOSITION_DATA
   and copy the components to CHARBUF.

   At most MAX_COMPOSITION_COMPONENTS - 1 pairs are read, so that the
   first character plus the pairs never exceed the
   MAX_COMPOSITION_COMPONENTS * 2 - 1 elements of the local array.

   NOTE(review): the pair loop has no explicit end-of-composition
   test; it stops only at the pair limit, at the end of the source, or
   by jumping to invalid_code.  The two copy loops also index
   COMPONENTS by the pair count I rather than by the number of stored
   components.  Both look suspicious but are left as they were --
   confirm against the intended format.  */
#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
  do {                                                          \
    /* Emacs 20 style format for rule-base composition.  */     \
    /* Store multibyte form of characters to be composed.  */   \
    enum composition_method method = COMPOSITION_WITH_RULE;     \
    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
    int *buf = components;                                      \
    int i, j;                                                   \
                                                                \
    DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
    for (i = 0; i < MAX_COMPOSITION_COMPONENTS - 1; i++)        \
      {                                                         \
        DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
        DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
      }                                                         \
    if (i < 1 || (buf - components) % 2 == 0)                   \
      goto invalid_code;                                        \
    /* Stop and retry later when CHARBUF is too full.  */       \
    if (charbuf + i + (i / 2) + 1 >= charbuf_end)               \
      goto no_more_source;                                      \
    /* The annotation belongs in CHARBUF, not in COMPONENTS.  */\
    ADD_COMPOSITION_DATA (charbuf, i, method);                  \
    for (j = 0; j < i; j++)                                     \
      *charbuf++ = components[j];                               \
    for (j = 0; j < i; j += 2)                                  \
      *charbuf++ = components[j];                               \
  } while (0)
2022
2023
/* Decode emacs-mule text from CODING->source, starting at offset
   CODING->consumed, into CODING->charbuf, starting at index
   CODING->charbuf_used.  Decoding stops when the source is exhausted
   or when fewer than MAX_ANNOTATION_LENGTH elements remain free in
   the character buffer.  On exit CODING->consumed,
   CODING->consumed_char and CODING->charbuf_used are updated to
   describe what was fully processed; each invalid sequence
   increments CODING->errors.  */
2024 static void
2025 decode_coding_emacs_mule (coding)
2026      struct coding_system *coding;
2027 {
2028   const unsigned char *src = coding->source + coding->consumed;
2029   const unsigned char *src_end = coding->source + coding->src_bytes;
2030   const unsigned char *src_base;
2031   int *charbuf = coding->charbuf + coding->charbuf_used;
2032   int *charbuf_end
2033     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2034   int consumed_chars = 0, consumed_chars_base;
2035   int multibytep = coding->src_multibyte;
2036   Lisp_Object attrs, charset_list;
2037   int char_offset = coding->produced_char;
2038   int last_offset = char_offset;
2039   int last_id = charset_ascii;
2040
2041   CODING_GET_INFO (coding, attrs, charset_list);
2042
2043   while (1)
2044     {
2045       int c;
2046
       /* Remember where this unit starts, so that an incomplete or
          invalid sequence can be rewound to this point.  */
2047       src_base = src;
2048       consumed_chars_base = consumed_chars;
2049
2050       if (charbuf >= charbuf_end)
2051         break;
2052
2053       ONE_MORE_BYTE (c);
2054       if (c < 0)
2055         {
           /* A negative value from ONE_MORE_BYTE is stored negated.  */
2056           *charbuf++ = -c;
2057           char_offset++;
2058         }
2059       else if (c < 0x80)
2060         {
2061           *charbuf++ = c;
2062           char_offset++;
2063         }
2064       else if (c == 0x80)
2065         {
           /* 0x80 introduces composition data; the next byte selects
              which of the three formats follows.  */
2066           ONE_MORE_BYTE (c);
2067           if (c < 0)
2068             goto invalid_code;
2069           if (c - 0xF2 >= COMPOSITION_RELATIVE
2070               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2071             DECODE_EMACS_MULE_21_COMPOSITION (c);
2072           else if (c < 0xC0)
2073             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2074           else if (c == 0xFF)
2075             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2076           else
2077             goto invalid_code;
2078         }
2079       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2080         {
2081           int nbytes, nchars;
2082           int id;
2083
           /* A multi-byte sequence: rewind and let emacs_mule_char
              decode it from its first byte.  */
2084           src = src_base;
2085           consumed_chars = consumed_chars_base;
2086           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2087           if (c < 0)
2088             {
2089               if (c == -2)
2090                 break;
2091               goto invalid_code;
2092             }
           /* When the charset changes, record the run of characters
              that used the previous non-ASCII charset.  */
2093           if (last_id != id)
2094             {
2095               if (last_id != charset_ascii)
2096                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2097               last_id = id;
2098               last_offset = char_offset;
2099             }
2100           *charbuf++ = c;
2101           src += nbytes;
2102           consumed_chars += nchars;
2103           char_offset++;
2104         }
       /* NOTE(review): a byte that matches none of the branches above
          (0xA0 or greater, or below 0xA0 with a table entry of 1 or
          less) is consumed here without storing anything in CHARBUF.
          Confirm that dropping it is intended.  */
2105       continue;
2106
2107     invalid_code:
       /* Rewind to the start of the unit and pass its first byte
          through as a single character.  */
2108       src = src_base;
2109       consumed_chars = consumed_chars_base;
2110       ONE_MORE_BYTE (c);
2111       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2112       char_offset++;
2113       coding->errors++;
2114     }
2115
2116  no_more_source:
     /* Flush the pending charset run, then report progress up to the
        start of the last unit that was begun.  */
2117   if (last_id != charset_ascii)
2118     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2119   coding->consumed_char += consumed_chars_base;
2120   coding->consumed = src_base - coding->source;
2121   coding->charbuf_used = charbuf - coding->charbuf;
2122 }
2123
2124
/* Store in CODES[0] and CODES[1] the leading codes for the emacs-mule
   charset id ID.  An id below 0xA0 is its own single leading code and
   CODES[1] is set to 0.  A larger id is stored in CODES[1], preceded
   in CODES[0] by 0x9A (id < 0xE0), 0x9B (id < 0xF0), 0x9C (id < 0xF5)
   or 0x9D (otherwise).

   The expansion is a single statement without a trailing semicolon,
   so it can be used as the body of an `if' that has an `else'.  Both
   arguments are evaluated more than once; do not pass expressions
   with side effects.  */
#define EMACS_MULE_LEADING_CODES(id, codes)     \
  do {                                          \
    if ((id) < 0xA0)                            \
      (codes)[0] = (id), (codes)[1] = 0;        \
    else if ((id) < 0xE0)                       \
      (codes)[0] = 0x9A, (codes)[1] = (id);     \
    else if ((id) < 0xF0)                       \
      (codes)[0] = 0x9B, (codes)[1] = (id);     \
    else if ((id) < 0xF5)                       \
      (codes)[0] = 0x9C, (codes)[1] = (id);     \
    else                                        \
      (codes)[0] = 0x9D, (codes)[1] = (id);     \
  } while (0)
2138
2139
2140 static int
2141 encode_coding_emacs_mule (coding)
2142      struct coding_system *coding;
2143 {
2144   int multibytep = coding->dst_multibyte;
2145   int *charbuf = coding->charbuf;
2146   int *charbuf_end = charbuf + coding->charbuf_used;
2147   unsigned char *dst = coding->destination + coding->produced;
2148   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2149   int safe_room = 8;
2150   int produced_chars = 0;
2151   Lisp_Object attrs, charset_list;
2152   int c;
2153   int preferred_charset_id = -1;
2154
2155   CODING_GET_INFO (coding, attrs, charset_list);
2156   if (! EQ (charset_list, Vemacs_mule_charset_list))
2157     {
2158       CODING_ATTR_CHARSET_LIST (attrs)
2159         = charset_list = Vemacs_mule_charset_list;
2160     }
2161
2162   while (charbuf < charbuf_end)
2163     {
2164       ASSURE_DESTINATION (safe_room);
2165       c = *charbuf++;
2166
2167       if (c < 0)
2168         {
2169           /* Handle an annotation.  */
2170           switch (*charbuf)
2171             {
2172             case CODING_ANNOTATE_COMPOSITION_MASK:
2173               /* Not yet implemented.  */
2174               break;
2175             case CODING_ANNOTATE_CHARSET_MASK:
2176               preferred_charset_id = charbuf[3];
2177               if (preferred_charset_id >= 0
2178                   && NILP (Fmemq (make_number (preferred_charset_id),
2179                                   charset_list)))
2180                 preferred_charset_id = -1;
2181               break;
2182             default:
2183               abort ();
2184             }
2185           charbuf += -c - 1;
2186           continue;
2187         }
2188
2189       if (ASCII_CHAR_P (c))
2190         EMIT_ONE_ASCII_BYTE (c);
2191       else if (CHAR_BYTE8_P (c))
2192         {
2193           c = CHAR_TO_BYTE8 (c);
2194           EMIT_ONE_BYTE (c);
2195         }
2196       else
2197         {
2198           struct charset *charset;
2199           unsigned code;
2200           int dimension;
2201           int emacs_mule_id;
2202           unsigned char leading_codes[2];
2203
2204           if (preferred_charset_id >= 0)
2205             {
2206               charset = CHARSET_FROM_ID (preferred_charset_id);
2207               if (! CHAR_CHARSET_P (c, charset))
2208                 charset = char_charset (c, charset_list, NULL);
2209             }
2210           else
2211             charset = char_charset (c, charset_list, &code);
2212           if (! charset)
2213             {
2214               c = coding->default_char;
2215               if (ASCII_CHAR_P (c))
2216                 {
2217                   EMIT_ONE_ASCII_BYTE (c);
2218                   continue;
2219                 }
2220               charset = char_charset (c, charset_list, &code);
2221             }
2222           dimension = CHARSET_DIMENSION (charset);
2223           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2224           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2225           EMIT_ONE_BYTE (leading_codes[0]);
2226           if (leading_codes[1])
2227             EMIT_ONE_BYTE (leading_codes[1]);
2228           if (dimension == 1)
2229             EMIT_ONE_BYTE (code | 0x80);
2230           else
2231             {
2232               code |= 0x8080;
2233               EMIT_ONE_BYTE (code >> 8);
2234               EMIT_ONE_BYTE (code & 0xFF);
2235             }
2236         }
2237     }
2238   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2239   coding->produced_char += produced_chars;
2240   coding->produced = dst - coding->destination;
2241   return 0;
2242 }
2243
2244 \f
2245 /*** 7. ISO2022 handlers ***/
2246
2247 /* The following note describes the coding system ISO2022 briefly.
2248    Since the intention of this note is to help understand the
2249    functions in this file, some parts are NOT ACCURATE or are OVERLY
2250    SIMPLIFIED.  For thorough understanding, please refer to the
2251    original document of ISO2022.  This is equivalent to the standard
2252    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2253
2254    ISO2022 provides many mechanisms to encode several character sets
2255    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2256    is encoded using bytes less than 128.  This may make the encoded
2257    text a little bit longer, but the text passes more easily through
2258    several types of gateway, some of which strip off the MSB (Most
2259    Significant Bit).
2260
2261    There are two kinds of character sets: control character sets and
2262    graphic character sets.  The former contain control characters such
2263    as `newline' and `escape' to provide control functions (control
2264    functions are also provided by escape sequences).  The latter
2265    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2266    two control character sets and many graphic character sets.
2267
2268    Graphic character sets are classified into one of the following
2269    four classes, according to the number of bytes (DIMENSION) and
2270    number of characters in one dimension (CHARS) of the set:
2271    - DIMENSION1_CHARS94
2272    - DIMENSION1_CHARS96
2273    - DIMENSION2_CHARS94
2274    - DIMENSION2_CHARS96
2275
2276    In addition, each character set is assigned an identification tag,
2277    unique for each set, called the "final character" (denoted as <F>
2278    hereafter).  The <F> of each character set is decided by ECMA(*)
2279    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2280    (0x30..0x3F are for private use only).
2281
2282    Note (*): ECMA = European Computer Manufacturers Association
2283
2284    Here are examples of graphic character sets [NAME(<F>)]:
2285         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2286         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2287         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2288         o DIMENSION2_CHARS96 -- none for the moment
2289
2290    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2291         C0 [0x00..0x1F] -- control character plane 0
2292         GL [0x20..0x7F] -- graphic character plane 0
2293         C1 [0x80..0x9F] -- control character plane 1
2294         GR [0xA0..0xFF] -- graphic character plane 1
2295
2296    A control character set is directly designated and invoked to C0 or
2297    C1 by an escape sequence.  The most common case is that:
2298    - ISO646's  control character set is designated/invoked to C0, and
2299    - ISO6429's control character set is designated/invoked to C1,
2300    and usually these designations/invocations are omitted in encoded
2301    text.  In a 7-bit environment, only C0 can be used, and a control
2302    character for C1 is encoded by an appropriate escape sequence to
2303    fit into the environment.  All control characters for C1 are
2304    defined to have corresponding escape sequences.
2305
2306    A graphic character set is at first designated to one of four
2307    graphic registers (G0 through G3), then these graphic registers are
2308    invoked to GL or GR.  These designations and invocations can be
2309    done independently.  The most common case is that G0 is invoked to
2310    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2311    these invocations and designations are omitted in encoded text.
2312    In a 7-bit environment, only GL can be used.
2313
2314    When a graphic character set of CHARS94 is invoked to GL, codes
2315    0x20 and 0x7F of the GL area work as control characters SPACE and
2316    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2317    be used.
2318
2319    There are two ways of invocation: locking-shift and single-shift.
2320    With locking-shift, the invocation lasts until the next different
2321    invocation, whereas with single-shift, the invocation affects the
2322    following character only and doesn't affect the locking-shift
2323    state.  Invocations are done by the following control characters or
2324    escape sequences:
2325
2326    ----------------------------------------------------------------------
2327    abbrev  function                  cntrl escape seq   description
2328    ----------------------------------------------------------------------
2329    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2330    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2331    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2332    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2333    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2334    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2335    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2336    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2337    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2338    ----------------------------------------------------------------------
2339    (*) These are not used by any known coding system.
2340
2341    Control characters for these functions are defined by macros
2342    ISO_CODE_XXX in `coding.h'.
2343
2344    Designations are done by the following escape sequences:
2345    ----------------------------------------------------------------------
2346    escape sequence      description
2347    ----------------------------------------------------------------------
2348    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2349    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2350    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2351    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2352    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2353    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2354    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2355    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2356    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2357    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2358    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2359    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2360    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2361    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2362    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2363    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2364    ----------------------------------------------------------------------
2365
2366    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2367    of dimension 1, chars 94, and final character <F>, etc...
2368
2369    Note (*): Although these designations are not allowed in ISO2022,
2370    Emacs accepts them on decoding, and produces them on encoding
2371    CHARS96 character sets in a coding system which is characterized as
2372    7-bit environment, non-locking-shift, and non-single-shift.
2373
2374    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2375    '(' must be omitted.  We refer to this as "short-form" hereafter.
2376
2377    Now you may notice that there are a lot of ways of encoding the
2378    same multilingual text in ISO2022.  Actually, there exist many
2379    coding systems such as Compound Text (used in X11's inter-client
2380    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2381    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2382    localized platforms), and all of these are variants of ISO2022.
2383
2384    In addition to the above, Emacs handles two more kinds of escape
2385    sequences: ISO6429's direction specification and Emacs' private
2386    sequence for specifying character composition.
2387
2388    ISO6429's direction specification takes the following form:
2389         o CSI ']'      -- end of the current direction
2390         o CSI '0' ']'  -- end of the current direction
2391         o CSI '1' ']'  -- start of left-to-right text
2392         o CSI '2' ']'  -- start of right-to-left text
2393    The control character CSI (0x9B: control sequence introducer) is
2394    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2395
2396    Character composition specification takes the following form:
2397         o ESC '0' -- start relative composition
2398         o ESC '1' -- end composition
2399         o ESC '2' -- start rule-base composition (*)
2400         o ESC '3' -- start relative composition with alternate chars  (**)
2401         o ESC '4' -- start rule-base composition with alternate chars  (**)
2402   Since these are not standard escape sequences of any ISO standard,
2403   the use of them with these meanings is restricted to Emacs only.
2404
2405   (*) This form is used only in Emacs 20.7 and older versions,
2406   but newer versions can safely decode it.
2407   (**) This form is used only in Emacs 21.1 and newer versions,
2408   and older versions can't decode it.
2409
2410   Here's a list of example usages of these composition escape
2411   sequences (categorized by `enum composition_method').
2412
2413   COMPOSITION_RELATIVE:
2414         ESC 0 CHAR [ CHAR ] ESC 1
2415   COMPOSITION_WITH_RULE:
2416         ESC 2 CHAR [ RULE CHAR ] ESC 1
2417   COMPOSITION_WITH_ALTCHARS:
2418         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2419   COMPOSITION_WITH_RULE_ALTCHARS:
2420         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2421
/* Table giving the `enum iso_code_class_type' of each possible byte
   value (256 entries, indexed by the byte).  decode_coding_iso_2022
   dispatches on iso_code_class[c1] for every byte it reads.  */
2422 enum iso_code_class_type iso_code_class[256];
2423
/* Nonzero if the charset whose id is ID may be used with CODING: ID
   must not exceed CODING->max_charset_id, and the entry for ID in
   CODING->safe_charsets must be non-negative.  Both CODING and ID are
   evaluated more than once.

   NOTE(review): setup_iso_safe_charsets fills unused entries with 255
   and detect_coding_iso_2022 reads the string through a `(char *)'
   cast, so the `>= 0' test appears to depend on plain `char' being
   signed -- confirm against the declaration of `safe_charsets'.  */
2424 #define SAFE_CHARSET_P(coding, id)      \
2425   ((id) <= (coding)->max_charset_id     \
2426    && (coding)->safe_charsets[id] >= 0)
2427
2428
/* Nonzero if CODING_ISO_INITIAL for register 1 of the coding system
   stored in `coding_categories[CATEGORY]' is non-negative.
   NOTE(review): presumably this means a charset is initially
   designated to G1, so that a shift-out has something to invoke --
   confirm against the definition of CODING_ISO_INITIAL.  */
2429 #define SHIFT_OUT_OK(category)  \
2430   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2431
/* Build the safe-charsets string of the ISO-2022 coding system whose
   attribute vector is ATTRS and store it in the
   coding_attr_safe_charsets slot of ATTRS.

   The string has one byte per charset id, from 0 up to the largest id
   found in the coding system's charset list.  Each byte is either the
   register number chosen for that charset (presumably one of the
   graphic registers G0..G3) or 255 when no register was chosen.

   If the slot already holds a string, nothing is rebuilt -- except
   that a coding system flagged CODING_ISO_FLAG_FULL_SUPPORT whose
   charset list is not Viso_2022_charset_list first has its list
   replaced and its slot reset to nil, which forces a rebuild.  */
2432 static void
2433 setup_iso_safe_charsets (attrs)
2434      Lisp_Object attrs;
2435 {
2436   Lisp_Object charset_list, safe_charsets;
2437   Lisp_Object request;
2438   Lisp_Object reg_usage;
2439   Lisp_Object tail;
2440   int reg94, reg96;
2441   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2442   int max_charset_id;
2443
2444   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
       /* A full-support coding system always works on
          Viso_2022_charset_list.  Switching lists invalidates whatever
          string was cached for the old list.  */
2445   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2446       && ! EQ (charset_list, Viso_2022_charset_list))
2447     {
2448       CODING_ATTR_CHARSET_LIST (attrs)
2449         = charset_list = Viso_2022_charset_list;
2450       ASET (attrs, coding_attr_safe_charsets, Qnil);
2451     }
2452
       /* The string has already been computed.  */
2453   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2454     return;
2455
       /* The string is indexed by charset id, so find the largest id.  */
2456   max_charset_id = 0;
2457   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2458     {
2459       int id = XINT (XCAR (tail));
2460       if (max_charset_id < id)
2461         max_charset_id = id;
2462     }
2463
       /* Every entry starts out as 255, i.e. "no register".  */
2464   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2465                                 make_number (255));
2466   request = AREF (attrs, coding_attr_iso_request);
       /* The usage slot is a cons: its car is the default register for
          94-character sets, its cdr the one for 96-character sets.  */
2467   reg_usage = AREF (attrs, coding_attr_iso_usage);
2468   reg94 = XINT (XCAR (reg_usage));
2469   reg96 = XINT (XCDR (reg_usage));
2470
2471   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2472     {
2473       Lisp_Object id;
2474       Lisp_Object reg;
2475       struct charset *charset;
2476
2477       id = XCAR (tail);
2478       charset = CHARSET_FROM_ID (XINT (id));
           /* An explicit entry for this charset in the request alist
              takes precedence.  Otherwise use the default register for
              the charset's size, but only if that default is below 4;
              a larger default leaves the entry at 255.  */
2479       reg = Fcdr (Fassq (id, request));
2480       if (! NILP (reg))
2481         SSET (safe_charsets, XINT (id), XINT (reg));
2482       else if (charset->iso_chars_96)
2483         {
2484           if (reg96 < 4)
2485             SSET (safe_charsets, XINT (id), reg96);
2486         }
2487       else
2488         {
2489           if (reg94 < 4)
2490             SSET (safe_charsets, XINT (id), reg94);
2491         }
2492     }
2493   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2494 }
2495
2496
2497 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2498    Check if a text is encoded in one of ISO-2022 based coding systems.
2499    If it is, return 1, else return 0.

        The bytes examined are those of CODING->source, skipping the
        first CODING->head_ascii bytes.  CATEGORY_MASK_ISO is always
        added to DETECT_INFO->checked.

        The scan keeps two masks, `found' and `rejected'.  If every ISO
        category ends up rejected, CATEGORY_MASK_ISO is added to
        DETECT_INFO->rejected and 0 is returned.  Otherwise the function
        leaves through the `no_more_source' label (presumably jumped to
        by ONE_MORE_BYTE when the source is exhausted), merges `rejected'
        and `found & ~rejected' into DETECT_INFO, and returns 1.  */
2500
2501 static int
2502 detect_coding_iso_2022 (coding, detect_info)
2503      struct coding_system *coding;
2504      struct coding_detection_info *detect_info;
2505 {
2506   const unsigned char *src = coding->source, *src_base = src;
2507   const unsigned char *src_end = coding->source + coding->src_bytes;
2508   int multibytep = coding->src_multibyte;
2509   int single_shifting = 0;
2510   int id;
2511   int c, c1;
2512   int consumed_chars = 0;
2513   int i;
2514   int rejected = 0;
2515   int found = 0;
2516
2517   detect_info->checked |= CATEGORY_MASK_ISO;
2518
       /* Prepare the coding system of each ISO category so that
          SAFE_CHARSET_P can be applied to it below: cache the data
          pointer and the largest index of its safe-charsets string.  */
2519   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2520     {
2521       struct coding_system *this = &(coding_categories[i]);
2522       Lisp_Object attrs, val;
2523
2524       attrs = CODING_ID_ATTRS (this->id);
           /* NOTE(review): this compares the safe-charsets slot (which
              setup_iso_safe_charsets sets to a string or nil) with a
              charset list, so the EQ test looks as if it can never
              succeed; CODING_ATTR_CHARSET_LIST may have been intended.
              setup_iso_safe_charsets repeats the check on the charset
              list itself -- confirm.  */
2525       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2526           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2527         setup_iso_safe_charsets (attrs);
2528       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2529       this->max_charset_id = SCHARS (val) - 1;
2530       this->safe_charsets = (char *) SDATA (val);
2531     }
2532
2533   /* A coding system of this category is always ASCII compatible.  */
2534   src += coding->head_ascii;
2535
2536   while (rejected != CATEGORY_MASK_ISO)
2537     {
2538       src_base = src;
2539       ONE_MORE_BYTE (c);
2540       switch (c)
2541         {
2542         case ISO_CODE_ESC:
2543           if (inhibit_iso_escape_detection)
2544             break;
2545           single_shifting = 0;
2546           ONE_MORE_BYTE (c);
2547           if (c >= '(' && c <= '/')
2548             {
2549               /* Designation sequence for a charset of dimension 1.  */
2550               ONE_MORE_BYTE (c1);
2551               if (c1 < ' ' || c1 >= 0x80
2552                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2553                 /* Invalid designation sequence.  Just ignore.  */
2554                 break;
2555             }
2556           else if (c == '$')
2557             {
2558               /* Designation sequence for a charset of dimension 2.  */
2559               ONE_MORE_BYTE (c);
2560               if (c >= '@' && c <= 'B')
2561                 /* Designation for JISX0208.1978, GB2312, or JISX0208.
                        NOTE(review): unlike the other designation branches,
                        ID is not checked for a negative value here;
                        presumably these three table entries are always
                        valid -- confirm.  */
2562                 id = iso_charset_table[1][0][c];
2563               else if (c >= '(' && c <= '/')
2564                 {
2565                   ONE_MORE_BYTE (c1);
2566                   if (c1 < ' ' || c1 >= 0x80
2567                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2568                     /* Invalid designation sequence.  Just ignore.  */
2569                     break;
2570                 }
2571               else
2572                 /* Invalid designation sequence.  Just ignore it.  */
2573                 break;
2574             }
2575           else if (c == 'N' || c == 'O')
2576             {
2577               /* ESC <Fe> for SS2 or SS3.  */
2578               single_shifting = 1;
2579               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2580               break;
2581             }
2582           else if (c >= '0' && c <= '4')
2583             {
2584               /* ESC <Fp> for start/end composition.  */
2585               found |= CATEGORY_MASK_ISO;
2586               break;
2587             }
2588           else
2589             {
2590               /* Invalid escape sequence.  Just ignore it.  */
2591               break;
2592             }
2593
2594           /* We found a valid designation sequence for CHARSET.
                  Each category that can carry designations is marked
                  found or rejected depending on whether the designated
                  charset is safe for that category's coding system.  */
2595           rejected |= CATEGORY_MASK_ISO_8BIT;
2596           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2597                               id))
2598             found |= CATEGORY_MASK_ISO_7;
2599           else
2600             rejected |= CATEGORY_MASK_ISO_7;
2601           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2602                               id))
2603             found |= CATEGORY_MASK_ISO_7_TIGHT;
2604           else
2605             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2606           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2607                               id))
2608             found |= CATEGORY_MASK_ISO_7_ELSE;
2609           else
2610             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2611           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2612                               id))
2613             found |= CATEGORY_MASK_ISO_8_ELSE;
2614           else
2615             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2616           break;
2617
2618         case ISO_CODE_SO:
2619         case ISO_CODE_SI:
2620           /* Locking shift out/in.  */
2621           if (inhibit_iso_escape_detection)
2622             break;
2623           single_shifting = 0;
2624           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2625           found |= CATEGORY_MASK_ISO_ELSE;
2626           break;
2627
2628         case ISO_CODE_CSI:
2629           /* Control sequence introducer.  */
2630           single_shifting = 0;
2631           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2632           found |= CATEGORY_MASK_ISO_8_ELSE;
2633           goto check_extra_latin;
2634
2635         case ISO_CODE_SS2:
2636         case ISO_CODE_SS3:
2637           /* Single shift.   */
2638           if (inhibit_iso_escape_detection)
2639             break;
2640           single_shifting = 0;
2641           rejected |= CATEGORY_MASK_ISO_7BIT;
               /* The byte counts as a single shift only for the 8-bit
                  categories whose coding system enables single shifts.
                  If neither does, treat it as a possible extra Latin
                  code.  */
2642           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2643               & CODING_ISO_FLAG_SINGLE_SHIFT)
2644             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2645           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2646               & CODING_ISO_FLAG_SINGLE_SHIFT)
2647             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2648           if (single_shifting)
2649             break;
2650           goto check_extra_latin;
2651
2652         default:
2653           if (c < 0)
2654             continue;
2655           if (c < 0x80)
2656             {
2657               single_shifting = 0;
2658               break;
2659             }
2660           if (c >= 0xA0)
2661             {
2662               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2663               found |= CATEGORY_MASK_ISO_8_1;
2664               /* Check the length of succeeding codes of the range
2665                  0xA0..0xFF.  If the byte length is even, we include
2666                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2667                  only when we are not single shifting.  */
2668               if (! single_shifting
2669                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2670                 {
                       /* This I shadows the function-level I.  It counts
                          the run of bytes >= 0xA0, starting at 1 for the
                          byte already read.  */
2671                   int i = 1;
2672                   while (src < src_end)
2673                     {
2674                       ONE_MORE_BYTE (c);
2675                       if (c < 0xA0)
2676                         break;
2677                       i++;
2678                     }
2679
                       /* An odd run is held against ISO_8_2 only when it
                          ended before the end of the source.  */
2680                   if (i & 1 && src < src_end)
2681                     rejected |= CATEGORY_MASK_ISO_8_2;
2682                   else
2683                     found |= CATEGORY_MASK_ISO_8_2;
2684                 }
2685               break;
2686             }
               /* Reached by falling through for the remaining bytes
                  (0x80..0x9F), and by the gotos above.  The byte is
                  acceptable only if Vlatin_extra_code_table has a
                  non-nil entry for it; otherwise every ISO category
                  is rejected.  */
2687         check_extra_latin:
2688           single_shifting = 0;
2689           if (! VECTORP (Vlatin_extra_code_table)
2690               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2691             {
2692               rejected = CATEGORY_MASK_ISO;
2693               break;
2694             }
2695           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2696               & CODING_ISO_FLAG_LATIN_EXTRA)
2697             found |= CATEGORY_MASK_ISO_8_1;
2698           else
2699             rejected |= CATEGORY_MASK_ISO_8_1;
2700           rejected |= CATEGORY_MASK_ISO_8_2;
2701         }
2702     }
       /* The loop ended because every ISO category was rejected.  */
2703   detect_info->rejected |= CATEGORY_MASK_ISO;
2704   return 0;
2705
2706  no_more_source:
2707   detect_info->rejected |= rejected;
2708   detect_info->found |= (found & ~rejected);
2709   return 1;
2710 }
2711
2712
2713 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
2714    escape sequence should be kept.

        REG is the register being designated.  DIM, CHARS_96 and FINAL
        select the charset through ISO_CHARSET_TABLE.  If FINAL is below
        '0' or not below 128, if the table has no charset for it, or if
        the charset fails SAFE_CHARSET_P for CODING, the register's
        designation is set to -2, CHARS_96 is set to -1, and the macro
        stops there.

        Otherwise the charset id is stored as the register's designation,
        with two substitutions: JISX0201-Roman becomes ASCII when
        CODING_ISO_FLAG_USE_ROMAN is set, and JISX0208.1978 becomes
        JISX0208 when CODING_ISO_FLAG_USE_OLDJIS is set.

        CHARS_96 must be an lvalue.  The macro uses a variable named
        `coding' from the enclosing scope.  */
2715 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2716   do {                                                                  \
2717     int id, prev;                                                       \
2718                                                                         \
2719     if (final < '0' || final >= 128                                     \
2720         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2721         || !SAFE_CHARSET_P (coding, id))                                \
2722       {                                                                 \
2723         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2724         chars_96 = -1;                                                  \
2725         break;                                                          \
2726       }                                                                 \
2727     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2728     if (id == charset_jisx0201_roman)                                   \
2729       {                                                                 \
2730         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2731           id = charset_ascii;                                           \
2732       }                                                                 \
2733     else if (id == charset_jisx0208_1978)                               \
2734       {                                                                 \
2735         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2736           id = charset_jisx0208;                                        \
2737       }                                                                 \
2738     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2739     /* If there was an invalid designation to REG previously, and this  \
2740        designation is ASCII to REG, we should keep this designation     \
2741        sequence.  */                                                    \
2742     if (prev == -2 && id == charset_ascii)                              \
2743       chars_96 = -1;                                                    \
2744   } while (0)
2745
2746
/* If a composition is being decoded (composition_state is not
   COMPOSING_NO), end it without recording any composition data: reset
   composition_state to COMPOSING_NO and copy the entries collected in
   `components' to CHARBUF.

   For COMPOSITION_RELATIVE and COMPOSITION_WITH_ALTCHARS every entry
   is copied and CHAR_OFFSET grows by COMPONENT_IDX.  For the other
   methods only the entries at even indices are copied (presumably the
   odd ones are rules) and CHAR_OFFSET grows by COMPONENT_IDX / 2 + 1.

   Jumps to `no_more_source' when CHARBUF + COMPONENT_IDX is beyond
   CHARBUF_END.  Does nothing when no composition is in progress.

   Uses these variables of the enclosing scope: charbuf, charbuf_end,
   components, component_idx, method, composition_state, char_offset.  */
2747 #define MAYBE_FINISH_COMPOSITION()                              \
2748   do {                                                          \
2749     int i;                                                      \
2750     if (composition_state == COMPOSING_NO)                      \
2751       break;                                                    \
2752     /* It is assured that we have enough room for producing     \
2753        characters stored in the table `components'.  */         \
2754     if (charbuf + component_idx > charbuf_end)                  \
2755       goto no_more_source;                                      \
2756     composition_state = COMPOSING_NO;                           \
2757     if (method == COMPOSITION_RELATIVE                          \
2758         || method == COMPOSITION_WITH_ALTCHARS)                 \
2759       {                                                         \
2760         for (i = 0; i < component_idx; i++)                     \
2761           *charbuf++ = components[i];                           \
2762         char_offset += component_idx;                           \
2763       }                                                         \
2764     else                                                        \
2765       {                                                         \
2766         for (i = 0; i < component_idx; i += 2)                  \
2767           *charbuf++ = components[i];                           \
2768         char_offset += (component_idx / 2) + 1;                 \
2769       }                                                         \
2770   } while (0)
2771
2772
2773 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2774    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2775    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2776    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2777    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

        C1 is the byte that followed ESC.

        If C1 is '0' while composition_state is COMPOSING_COMPONENT_RULE,
        the number of entries collected so far is saved in COMPONENT_LEN
        and the state becomes COMPOSING_CHAR.

        Otherwise a new composition begins.  Any composition in progress
        is ended with MAYBE_FINISH_COMPOSITION, CHARBUF must have room for
        MAX_COMPOSITION_COMPONENTS more entries (else `no_more_source'),
        and the remaining source is searched for the ending ESC '1'.  If
        none is found, control goes to `invalid_code' when this is the
        last block (CODING_MODE_LAST_BLOCK) and to `no_more_source'
        otherwise.  Then METHOD and composition_state are set from C1 and
        COMPONENT_IDX and COMPONENT_LEN are reset to 0.

        NOTE(review): when SRC equals SRC_END on entry, the search loop
        does not run and P is not SRC_END - 1, so the "not found" branch
        is skipped -- confirm whether that situation can occur.
2778   */
2779
2780 #define DECODE_COMPOSITION_START(c1)                                    \
2781   do {                                                                  \
2782     if (c1 == '0'                                                       \
2783         && composition_state == COMPOSING_COMPONENT_RULE)               \
2784       {                                                                 \
2785         component_len = component_idx;                                  \
2786         composition_state = COMPOSING_CHAR;                             \
2787       }                                                                 \
2788     else                                                                \
2789       {                                                                 \
2790         const unsigned char *p;                                         \
2791                                                                         \
2792         MAYBE_FINISH_COMPOSITION ();                                    \
2793         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2794           goto no_more_source;                                          \
2795         for (p = src; p < src_end - 1; p++)                             \
2796           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2797             break;                                                      \
2798         if (p == src_end - 1)                                           \
2799           {                                                             \
2800             if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
2801               goto invalid_code;                                        \
2802             goto no_more_source;                                        \
2803           }                                                             \
2804                                                                         \
2805         /* This is surely the start of a composition.  */               \
2806         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2807                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2808                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2809                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2810         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2811                              : COMPOSING_COMPONENT_CHAR);               \
2812         component_idx = component_len = 0;                              \
2813       }                                                                 \
2814   } while (0)
2815
2816
2817 /* Handle composition end sequence ESC 1.

        Emit the composition collected in `components' to CHARBUF.

        The number of composed characters passed to ADD_COMPOSITION_DATA
        is COMPONENT_IDX - COMPONENT_LEN when COMPONENT_LEN is positive;
        otherwise COMPONENT_IDX for COMPOSITION_RELATIVE and
        (COMPONENT_IDX + 1) / 2 for the other methods.

        For every method except COMPOSITION_RELATIVE the components are
        appended next (the first COMPONENT_LEN entries, or all
        COMPONENT_IDX entries when COMPONENT_LEN is 0), and the slot where
        this annotation started is overwritten with SAVED_CHARBUF -
        CHARBUF, a negative count (presumably the negated annotation
        length -- confirm against ADD_COMPOSITION_DATA).

        Finally the characters themselves are appended, CHAR_OFFSET being
        incremented once per character: every second entry of
        `components' for COMPOSITION_WITH_RULE, otherwise the entries
        from COMPONENT_LEN up to COMPONENT_IDX.  CODING->annotated is set
        to 1 and composition_state returns to COMPOSING_NO.  */
2818
2819 #define DECODE_COMPOSITION_END()                                        \
2820   do {                                                                  \
2821     int nchars = (component_len > 0 ? component_idx - component_len     \
2822                   : method == COMPOSITION_RELATIVE ? component_idx      \
2823                   : (component_idx + 1) / 2);                           \
2824     int i;                                                              \
2825     int *saved_charbuf = charbuf;                                       \
2826                                                                         \
2827     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
2828     if (method != COMPOSITION_RELATIVE)                                 \
2829       {                                                                 \
2830         if (component_len == 0)                                         \
2831           for (i = 0; i < component_idx; i++)                           \
2832             *charbuf++ = components[i];                                 \
2833         else                                                            \
2834           for (i = 0; i < component_len; i++)                           \
2835             *charbuf++ = components[i];                                 \
2836         *saved_charbuf = saved_charbuf - charbuf;                       \
2837       }                                                                 \
2838     if (method == COMPOSITION_WITH_RULE)                                \
2839       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2840         *charbuf++ = components[i];                                     \
2841     else                                                                \
2842       for (i = component_len; i < component_idx; i++, char_offset++)    \
2843         *charbuf++ = components[i];                                     \
2844     coding->annotated = 1;                                              \
2845     composition_state = COMPOSING_NO;                                   \
2846   } while (0)
2847
2848
2849 /* Decode a composition rule from the byte C1 (and maybe one more byte
2850    from SRC) and store the encoded composition rule back into C1,
2851    which must therefore be an lvalue.

        32 is first subtracted from C1.  A result below 81 is the old
        one-byte format: it is split into C1 / 9 and C1 % 9, a value of 4
        in either part is replaced by 10, and the pair is passed to
        COMPOSITION_ENCODE_RULE.  A result from 81 to 92 is the new
        two-byte format: a second byte is read into C2 with ONE_MORE_BYTE
        and the rule is COMPOSITION_ENCODE_RULE (C1 - 81, C2 - 32).  Any
        other value yields 0.

        The macro uses the variable `c2' of the enclosing scope.  */
2852
2853 #define DECODE_COMPOSITION_RULE(c1)                                     \
2854   do {                                                                  \
2855     (c1) -= 32;                                                         \
2856     if (c1 < 81)                /* old format (before ver.21) */        \
2857       {                                                                 \
2858         int gref = (c1) / 9;                                            \
2859         int nref = (c1) % 9;                                            \
2860         if (gref == 4) gref = 10;                                       \
2861         if (nref == 4) nref = 10;                                       \
2862         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2863       }                                                                 \
2864     else if (c1 < 93)           /* new format (after ver.21) */         \
2865       {                                                                 \
2866         ONE_MORE_BYTE (c2);                                             \
2867         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2868       }                                                                 \
2869     else                                                                \
2870       c1 = 0;                                                           \
2871   } while (0)
2872
2873
2874 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2875
2876 static void
2877 decode_coding_iso_2022 (coding)
2878      struct coding_system *coding;
2879 {
2880   const unsigned char *src = coding->source + coding->consumed;
2881   const unsigned char *src_end = coding->source + coding->src_bytes;
2882   const unsigned char *src_base;
2883   int *charbuf = coding->charbuf + coding->charbuf_used;
2884   int *charbuf_end
2885     = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2886   int consumed_chars = 0, consumed_chars_base;
2887   int multibytep = coding->src_multibyte;
2888   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2889   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2890   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2891   int charset_id_2, charset_id_3;
2892   struct charset *charset;
2893   int c;
2894   /* For handling composition sequence.  */
2895 #define COMPOSING_NO                    0
2896 #define COMPOSING_CHAR                  1
2897 #define COMPOSING_RULE                  2
2898 #define COMPOSING_COMPONENT_CHAR        3
2899 #define COMPOSING_COMPONENT_RULE        4
2900
2901   int composition_state = COMPOSING_NO;
2902   enum composition_method method;
2903   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2904   int component_idx;
2905   int component_len;
2906   Lisp_Object attrs, charset_list;
2907   int char_offset = coding->produced_char;
2908   int last_offset = char_offset;
2909   int last_id = charset_ascii;
2910
2911   CODING_GET_INFO (coding, attrs, charset_list);
2912   setup_iso_safe_charsets (attrs);
2913
2914   while (1)
2915     {
2916       int c1, c2;
2917
2918       src_base = src;
2919       consumed_chars_base = consumed_chars;
2920
2921       if (charbuf >= charbuf_end)
2922         break;
2923
2924       ONE_MORE_BYTE (c1);
2925       if (c1 < 0)
2926         goto invalid_code;
2927
2928       /* We produce at most one character.  */
2929       switch (iso_code_class [c1])
2930         {
2931         case ISO_0x20_or_0x7F:
2932           if (composition_state != COMPOSING_NO)
2933             {
2934               if (composition_state == COMPOSING_RULE
2935                   || composition_state == COMPOSING_COMPONENT_RULE)
2936                 {
2937                   DECODE_COMPOSITION_RULE (c1);
2938                   components[component_idx++] = c1;
2939                   composition_state--;
2940                   continue;
2941                 }
2942             }
2943           if (charset_id_0 < 0
2944               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2945             /* This is SPACE or DEL.  */
2946             charset = CHARSET_FROM_ID (charset_ascii);
2947           else
2948             charset = CHARSET_FROM_ID (charset_id_0);
2949           break;
2950
2951         case ISO_graphic_plane_0:
2952           if (composition_state != COMPOSING_NO)
2953             {
2954               if (composition_state == COMPOSING_RULE
2955                   || composition_state == COMPOSING_COMPONENT_RULE)
2956                 {
2957                   DECODE_COMPOSITION_RULE (c1);
2958                   components[component_idx++] = c1;
2959                   composition_state--;
2960                   continue;
2961                 }
2962             }
2963           if (charset_id_0 < 0)
2964             charset = CHARSET_FROM_ID (charset_ascii);
2965           else
2966             charset = CHARSET_FROM_ID (charset_id_0);
2967           break;
2968
2969         case ISO_0xA0_or_0xFF:
2970           if (charset_id_1 < 0
2971               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2972               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2973             goto invalid_code;
2974           /* This is a graphic character, we fall down ... */
2975
2976         case ISO_graphic_plane_1:
2977           if (charset_id_1 < 0)
2978             goto invalid_code;
2979           charset = CHARSET_FROM_ID (charset_id_1);
2980           break;
2981
2982         case ISO_control_0:
2983           MAYBE_FINISH_COMPOSITION ();
2984           charset = CHARSET_FROM_ID (charset_ascii);
2985           break;
2986
2987         case ISO_control_1:
2988           MAYBE_FINISH_COMPOSITION ();
2989           goto invalid_code;
2990
2991         case ISO_shift_out:
2992           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2993               || CODING_ISO_DESIGNATION (coding, 1) < 0)
2994             goto invalid_code;
2995           CODING_ISO_INVOCATION (coding, 0) = 1;
2996           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2997           continue;
2998
2999         case ISO_shift_in:
3000           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3001             goto invalid_code;
3002           CODING_ISO_INVOCATION (coding, 0) = 0;
3003           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3004           continue;
3005
3006         case ISO_single_shift_2_7:
3007         case ISO_single_shift_2:
3008           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3009             goto invalid_code;
3010           /* SS2 is handled as an escape sequence of ESC 'N' */
3011           c1 = 'N';
3012           goto label_escape_sequence;
3013
3014         case ISO_single_shift_3:
3015           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3016             goto invalid_code;
3017           /* SS3 is handled as an escape sequence of ESC 'O' */
3018           c1 = 'O';
3019           goto label_escape_sequence;
3020
3021         case ISO_control_sequence_introducer:
3022           /* CSI is handled as an escape sequence of ESC '[' ...  */
3023           c1 = '[';
3024           goto label_escape_sequence;
3025
3026         case ISO_escape:
3027           ONE_MORE_BYTE (c1);
3028         label_escape_sequence:
3029           /* Escape sequences handled here are invocation,
3030              designation, direction specification, and character
3031              composition specification.  */
3032           switch (c1)
3033             {
3034             case '&':           /* revision of following character set */
3035               ONE_MORE_BYTE (c1);
3036               if (!(c1 >= '@' && c1 <= '~'))
3037                 goto invalid_code;
3038               ONE_MORE_BYTE (c1);
3039               if (c1 != ISO_CODE_ESC)
3040                 goto invalid_code;
3041               ONE_MORE_BYTE (c1);
3042               goto label_escape_sequence;
3043
3044             case '$':           /* designation of 2-byte character set */
3045               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3046                 goto invalid_code;
3047               {
3048                 int reg, chars96;
3049
3050                 ONE_MORE_BYTE (c1);
3051                 if (c1 >= '@' && c1 <= 'B')
3052                   {     /* designation of JISX0208.1978, GB2312.1980,
3053                            or JISX0208.1980 */
3054                     reg = 0, chars96 = 0;
3055                   }
3056                 else if (c1 >= 0x28 && c1 <= 0x2B)
3057                   { /* designation of DIMENSION2_CHARS94 character set */
3058                     reg = c1 - 0x28, chars96 = 0;
3059                     ONE_MORE_BYTE (c1);
3060                   }
3061                 else if (c1 >= 0x2C && c1 <= 0x2F)
3062                   { /* designation of DIMENSION2_CHARS96 character set */
3063                     reg = c1 - 0x2C, chars96 = 1;
3064                     ONE_MORE_BYTE (c1);
3065                   }
3066                 else
3067                   goto invalid_code;
3068                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3069                 /* We must update these variables now.  */
3070                 if (reg == 0)
3071                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3072                 else if (reg == 1)
3073                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3074                 if (chars96 < 0)
3075                   goto invalid_code;
3076               }
3077               continue;
3078
3079             case 'n':           /* invocation of locking-shift-2 */
3080               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3081                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3082                 goto invalid_code;
3083               CODING_ISO_INVOCATION (coding, 0) = 2;
3084               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3085               continue;
3086
3087             case 'o':           /* invocation of locking-shift-3 */
3088               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3089                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3090                 goto invalid_code;
3091               CODING_ISO_INVOCATION (coding, 0) = 3;
3092               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3093               continue;
3094
3095             case 'N':           /* invocation of single-shift-2 */
3096               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3097                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3098                 goto invalid_code;
3099               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3100               if (charset_id_2 < 0)
3101                 charset = CHARSET_FROM_ID (charset_ascii);
3102               else
3103                 charset = CHARSET_FROM_ID (charset_id_2);
3104               ONE_MORE_BYTE (c1);
3105               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3106                 goto invalid_code;
3107               break;
3108
3109             case 'O':           /* invocation of single-shift-3 */
3110               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3111                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3112                 goto invalid_code;
3113               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3114               if (charset_id_3 < 0)
3115                 charset = CHARSET_FROM_ID (charset_ascii);
3116               else
3117                 charset = CHARSET_FROM_ID (charset_id_3);
3118               ONE_MORE_BYTE (c1);
3119               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3120                 goto invalid_code;
3121               break;
3122
3123             case '0': case '2': case '3': case '4': /* start composition */
3124               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3125                 goto invalid_code;
3126               DECODE_COMPOSITION_START (c1);
3127               continue;
3128
3129             case '1':           /* end composition */
3130               if (composition_state == COMPOSING_NO)
3131                 goto invalid_code;
3132               DECODE_COMPOSITION_END ();
3133               continue;
3134
3135             case '[':           /* specification of direction */
3136               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3137                 goto invalid_code;
3138               /* For the moment, nested direction is not supported.
3139                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3140                  left-to-right, and nozero means right-to-left.  */
3141               ONE_MORE_BYTE (c1);
3142               switch (c1)
3143                 {
3144                 case ']':       /* end of the current direction */
3145                   coding->mode &= ~CODING_MODE_DIRECTION;
3146
3147                 case '0':       /* end of the current direction */
3148                 case '1':       /* start of left-to-right direction */
3149                   ONE_MORE_BYTE (c1);
3150                   if (c1 == ']')
3151                     coding->mode &= ~CODING_MODE_DIRECTION;
3152                   else
3153                     goto invalid_code;
3154                   break;
3155
3156                 case '2':       /* start of right-to-left direction */
3157                   ONE_MORE_BYTE (c1);
3158                   if (c1 == ']')
3159                     coding->mode |= CODING_MODE_DIRECTION;
3160                   else
3161                     goto invalid_code;
3162                   break;
3163
3164                 default:
3165                   goto invalid_code;
3166                 }
3167               continue;
3168
3169             case '%':
3170               ONE_MORE_BYTE (c1);
3171               if (c1 == '/')
3172                 {
3173                   /* CTEXT extended segment:
3174                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3175                      We keep these bytes as is for the moment.
3176                      They may be decoded by post-read-conversion.  */
3177                   int dim, M, L;
3178                   int size;
3179
3180                   ONE_MORE_BYTE (dim);
3181                   ONE_MORE_BYTE (M);
3182                   ONE_MORE_BYTE (L);
3183                   size = ((M - 128) * 128) + (L - 128);
3184                   if (charbuf + 8 + size > charbuf_end)
3185                     goto break_loop;
3186                   *charbuf++ = ISO_CODE_ESC;
3187                   *charbuf++ = '%';
3188                   *charbuf++ = '/';
3189                   *charbuf++ = dim;
3190                   *charbuf++ = BYTE8_TO_CHAR (M);
3191                   *charbuf++ = BYTE8_TO_CHAR (L);
3192                   while (size-- > 0)
3193                     {
3194                       ONE_MORE_BYTE (c1);
3195                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3196                     }
3197                 }
3198               else if (c1 == 'G')
3199                 {
3200                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3201                      ESC % G --UTF-8-BYTES-- ESC % @
3202                      We keep these bytes as is for the moment.
3203                      They may be decoded by post-read-conversion.  */
3204                   int *p = charbuf;
3205
3206                   if (p + 6 > charbuf_end)
3207                     goto break_loop;
3208                   *p++ = ISO_CODE_ESC;
3209                   *p++ = '%';
3210                   *p++ = 'G';
3211                   while (p < charbuf_end)
3212                     {
3213                       ONE_MORE_BYTE (c1);
3214                       if (c1 == ISO_CODE_ESC
3215                           && src + 1 < src_end
3216                           && src[0] == '%'
3217                           && src[1] == '@')
3218                         {
3219                           src += 2;
3220                           break;
3221                         }
3222                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3223                     }
3224                   if (p + 3 > charbuf_end)
3225                     goto break_loop;
3226                   *p++ = ISO_CODE_ESC;
3227                   *p++ = '%';
3228                   *p++ = '@';
3229                   charbuf = p;
3230                 }
3231               else
3232                 goto invalid_code;
3233               continue;
3234               break;
3235
3236             default:
3237               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3238                 goto invalid_code;
3239               {
3240                 int reg, chars96;
3241
3242                 if (c1 >= 0x28 && c1 <= 0x2B)
3243                   { /* designation of DIMENSION1_CHARS94 character set */
3244                     reg = c1 - 0x28, chars96 = 0;
3245                     ONE_MORE_BYTE (c1);
3246                   }
3247                 else if (c1 >= 0x2C && c1 <= 0x2F)
3248                   { /* designation of DIMENSION1_CHARS96 character set */
3249                     reg = c1 - 0x2C, chars96 = 1;
3250                     ONE_MORE_BYTE (c1);
3251                   }
3252                 else
3253                   goto invalid_code;
3254                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3255                 /* We must update these variables now.  */
3256                 if (reg == 0)
3257                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3258                 else if (reg == 1)
3259                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3260                 if (chars96 < 0)
3261                   goto invalid_code;
3262               }
3263               continue;
3264             }
3265         }
3266
3267       if (charset->id != charset_ascii
3268           && last_id != charset->id)
3269         {
3270           if (last_id != charset_ascii)
3271             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3272           last_id = charset->id;
3273           last_offset = char_offset;
3274         }
3275
3276       /* Now we know CHARSET and 1st position code C1 of a character.
3277          Produce a decoded character while getting 2nd position code
3278          C2 if necessary.  */
3279       c1 &= 0x7F;
3280       if (CHARSET_DIMENSION (charset) > 1)
3281         {
3282           ONE_MORE_BYTE (c2);
3283           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3284             /* C2 is not in a valid range.  */
3285             goto invalid_code;
3286           c1 = (c1 << 8) | (c2 & 0x7F);
3287           if (CHARSET_DIMENSION (charset) > 2)
3288             {
3289               ONE_MORE_BYTE (c2);
3290               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3291                 /* C2 is not in a valid range.  */
3292                 goto invalid_code;
3293               c1 = (c1 << 8) | (c2 & 0x7F);
3294             }
3295         }
3296
3297       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3298       if (c < 0)
3299         {
3300           MAYBE_FINISH_COMPOSITION ();
3301           for (; src_base < src; src_base++, char_offset++)
3302             {
3303               if (ASCII_BYTE_P (*src_base))
3304                 *charbuf++ = *src_base;
3305               else
3306                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3307             }
3308         }
3309       else if (composition_state == COMPOSING_NO)
3310         {
3311           *charbuf++ = c;
3312           char_offset++;
3313         }
3314       else
3315         {
3316           components[component_idx++] = c;
3317           if (method == COMPOSITION_WITH_RULE
3318               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3319                   && composition_state == COMPOSING_COMPONENT_CHAR))
3320             composition_state++;
3321         }
3322       continue;
3323
3324     invalid_code:
3325       MAYBE_FINISH_COMPOSITION ();
3326       src = src_base;
3327       consumed_chars = consumed_chars_base;
3328       ONE_MORE_BYTE (c);
3329       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3330       char_offset++;
3331       coding->errors++;
3332       continue;
3333
3334     break_loop:
3335       break;
3336     }
3337
3338  no_more_source:
3339   if (last_id != charset_ascii)
3340     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3341   coding->consumed_char += consumed_chars_base;
3342   coding->consumed = src_base - coding->source;
3343   coding->charbuf_used = charbuf - coding->charbuf;
3344 }
3345
3346
3347 /* ISO2022 encoding stuff.  */
3348
/*
   On encoding, it is not enough to say just "ISO2022"; we have to
   specify more details.  In Emacs, each coding system of ISO2022
   variant has the following specifications:
        1. Initial designation to G0 thru G3.
        2. Allows short-form designation?
        3. ASCII should be designated to G0 before control characters?
        4. ASCII should be designated to G0 at end of line?
        5. 7-bit environment or 8-bit environment?
        6. Use locking-shift?
        7. Use single-shift?
   And the following two are only for Japanese:
        8. Use ASCII in place of JISX0201-1976-Roman?
        9. Use JISX0208-1983 in place of JISX0208-1978?
   These specifications are encoded in CODING_ISO_FLAGS (coding) as flag
   bits defined by the macros CODING_ISO_FLAG_XXX.  See `coding.h' for
   more details.
*/
3367
/* Emit the escape sequence that designates CHARSET to graphic
   register REG, then record CHARSET's id as the designation of REG in
   CODING.  Output goes through the EMIT_* macros.

   The sequence is built as follows:
     - ESC '&' ('@' + REVISION), only when CODING has
       CODING_ISO_FLAG_REVISION set and CHARSET's revision is >= 0;
     - ESC;
     - '$' when the dimension of CHARSET is not 1;
     - one intermediate byte selected by REG: one of ",-./" for a
       96-character set, one of "()*+" otherwise.  The short form
       (no intermediate byte) is used for a charset of dimension other
       than 1 with 94 characters designated to register 0 whose
       <final-char> is '@', 'A', or 'B', unless CODING has
       CODING_ISO_FLAG_LONG_FORM set;
     - <final-char> of CHARSET.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *inter_94 = "()*+";                                      \
    const char *inter_96 = ",-./";                                      \
    unsigned char final_byte = CHARSET_ISO_FINAL (charset);             \
    int one_dimension = CHARSET_DIMENSION (charset) == 1;               \
    int rev_num = -1;                                                   \
                                                                        \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
      rev_num = CHARSET_ISO_REVISION (charset);                         \
    if (rev_num >= 0)                                                   \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + rev_num);                                  \
      }                                                                 \
                                                                        \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (! one_dimension)                                                \
      EMIT_ONE_ASCII_BYTE ('$');                                        \
    if (CHARSET_ISO_CHARS_96 (charset))                                 \
      EMIT_ONE_ASCII_BYTE (inter_96[reg]);                              \
    else if (one_dimension                                              \
             || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
             || reg != 0                                                \
             || final_byte < '@' || final_byte > 'B')                   \
      EMIT_ONE_ASCII_BYTE (inter_94[reg]);                              \
    EMIT_ONE_ASCII_BYTE (final_byte);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3415
3416
3417 /* The following two macros produce codes (control character or escape
3418    sequence) for ISO2022 single-shift functions (single-shift-2 and
3419    single-shift-3).  */
3420
/* Single-shift-2: the single byte ISO_CODE_SS2, or ESC 'N' when
   `coding' has CODING_ISO_FLAG_SEVEN_BITS set.  Afterwards
   CODING_ISO_SINGLE_SHIFTING (coding) is set to 1.  */
#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3429
3430
/* Single-shift-3: the single byte ISO_CODE_SS3, or ESC 'O' when
   `coding' has CODING_ISO_FLAG_SEVEN_BITS set.  Afterwards
   CODING_ISO_SINGLE_SHIFTING (coding) is set to 1.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
    else                                                                \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3439
3440
3441 /* The following four macros produce codes (control character or
3442    escape sequence) for ISO2022 locking-shift functions (shift-in,
3443    shift-out, locking-shift-2, and locking-shift-3).  */
3444
/* Shift-in: record graphic register 0 as the one invoked to graphic
   plane 0 of `coding', and emit ISO_CODE_SI.  */
#define ENCODE_SHIFT_IN                                 \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 0;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
  } while (0)
3450
3451
/* Shift-out: record graphic register 1 as the one invoked to graphic
   plane 0 of `coding', and emit ISO_CODE_SO.  */
#define ENCODE_SHIFT_OUT                                \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 1;              \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
  } while (0)
3457
3458
/* Locking-shift-2: record graphic register 2 as the one invoked to
   graphic plane 0 of `coding', and emit ESC 'n'.  */
#define ENCODE_LOCKING_SHIFT_2                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 2;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
  } while (0)
3464
3465
/* Locking-shift-3: record graphic register 3 as the one invoked to
   graphic plane 0 of `coding', and emit ESC 'o'.

   The final byte must be 'o': ESC 'n' is locking-shift-2, and the
   ISO-2022 decoder in this file treats ESC 'o' as the invocation of
   locking-shift-3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
  } while (0)
3471
3472
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   The macro reads and updates `coding', `dst' and `produced_chars'
   from the enclosing scope.  CHARSET must be a modifiable lvalue: when
   `coding' has CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII,
   CHARSET is replaced by the JISX0201-Roman charset.

   The byte is emitted with the eighth bit cleared when CHARSET is
   invoked to graphic plane 0 (or on a pending single shift in a 7-bit
   environment), and with the eighth bit set when it is invoked to
   graphic plane 1 (or on a pending single shift otherwise).  C1 is
   parenthesized wherever it is used so that an argument expression
   containing operators of lower precedence than `&' is masked as a
   whole.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* A single shift is pending; it covers just this one           \
           character, so clear the state after emitting.  */            \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3515
3516
/* Emit a two-byte character of CHARSET whose position codes are C1
   and C2, first producing whatever designation and invocation codes
   are needed to make CHARSET usable.

   `coding', `dst' and `produced_chars' are taken from the enclosing
   scope.  CHARSET must be a modifiable lvalue: when `coding' has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is charset_jisx0208, it
   is replaced by the charset of id charset_jisx0208_1978.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if (id == charset_jisx0208                                          \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        id = charset_jisx0208_1978;                                     \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (! CODING_ISO_SINGLE_SHIFTING (coding)                           \
        && id != CODING_ISO_INVOKED_CHARSET (coding, 0)                 \
        && id != CODING_ISO_INVOKED_CHARSET (coding, 1))                \
      {                                                                 \
        /* CHARSET is reachable through neither graphic plane and no    \
           single shift is pending: designate and/or invoke it, then    \
           go around the loop again to emit the bytes.  */              \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
        continue;                                                       \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
    else                                                                \
      EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
    break;                                                              \
  } while (1)
3559
3560
/* Encode character C with CHARSET.  The code obtained from
   ENCODE_CHAR is emitted as one byte when the dimension of CHARSET is
   1; otherwise it is split into (code >> 8) and (code & 0xFF) and
   emitted as a two-byte character.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                   \
  do {                                                                     \
    int iso_code = ENCODE_CHAR ((charset),(c));                            \
                                                                           \
    if (CHARSET_DIMENSION (charset) != 1)                                  \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), iso_code >> 8,           \
                                       iso_code & 0xFF);                   \
    else                                                                   \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), iso_code);               \
  } while (0)
3570
3571
3572 /* Produce designation and invocation codes at a place pointed by DST
3573    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3574    Return new DST.  */
3575
3576 unsigned char *
3577 encode_invocation_designation (charset, coding, dst, p_nchars)
3578      struct charset *charset;
3579      struct coding_system *coding;
3580      unsigned char *dst;
3581      int *p_nchars;
3582 {
3583   int multibytep = coding->dst_multibyte;
3584   int produced_chars = *p_nchars;
3585   int reg;                      /* graphic register number */
3586   int id = CHARSET_ID (charset);
3587
3588   /* At first, check designations.  */
3589   for (reg = 0; reg < 4; reg++)
3590     if (id == CODING_ISO_DESIGNATION (coding, reg))
3591       break;
3592
3593   if (reg >= 4)
3594     {
3595       /* CHARSET is not yet designated to any graphic registers.  */
3596       /* At first check the requested designation.  */
3597       reg = CODING_ISO_REQUEST (coding, id);
3598       if (reg < 0)
3599         /* Since CHARSET requests no special designation, designate it
3600            to graphic register 0.  */
3601         reg = 0;
3602
3603       ENCODE_DESIGNATION (charset, reg, coding);
3604     }
3605
3606   if (CODING_ISO_INVOCATION (coding, 0) != reg
3607       && CODING_ISO_INVOCATION (coding, 1) != reg)
3608     {
3609       /* Since the graphic register REG is not invoked to any graphic
3610          planes, invoke it to graphic plane 0.  */
3611       switch (reg)
3612         {
3613         case 0:                 /* graphic register 0 */
3614           ENCODE_SHIFT_IN;
3615           break;
3616
3617         case 1:                 /* graphic register 1 */
3618           ENCODE_SHIFT_OUT;
3619           break;
3620
3621         case 2:                 /* graphic register 2 */
3622           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3623             ENCODE_SINGLE_SHIFT_2;
3624           else
3625             ENCODE_LOCKING_SHIFT_2;
3626           break;
3627
3628         case 3:                 /* graphic register 3 */
3629           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3630             ENCODE_SINGLE_SHIFT_3;
3631           else
3632             ENCODE_LOCKING_SHIFT_3;
3633           break;
3634         }
3635     }
3636
3637   *p_nchars = produced_chars;
3638   return dst;
3639 }
3640
3641 /* The following three macros produce codes for indicating direction
3642    of text.  */
/* Emit a control sequence introducer: ESC '[' when `coding' has
   CODING_ISO_FLAG_SEVEN_BITS set, the single byte ISO_CODE_CSI
   otherwise.

   The flag is tested with `&', as everywhere else in this file.
   Comparing the whole flag word with `==' would select the 7-bit form
   only when no other flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
3650
3651
/* Emit the sequence that starts right-to-left text: a control
   sequence introducer followed by '2' ']'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it is
   used without an argument list.  Writing `(dst)' after it would leave
   a stray parenthesized expression behind the expanded `while (0)'.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
3657
3658
/* Emit the sequence that returns to left-to-right text: a control
   sequence introducer followed by '0' ']'.

   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it is
   used without an argument list.  Writing `(dst)' after it would leave
   a stray parenthesized expression behind the expanded `while (0)'.  */
#define ENCODE_DIRECTION_L2R()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('0', ']');            \
  } while (0)
3664
3665
/* Bring the graphic planes and registers of `coding' back to their
   initial state: shift in if graphic plane 0 currently invokes a
   register other than 0, then re-designate every register whose
   initial charset is set (>= 0) and differs from the charset
   currently designated to it.  */
#define ENCODE_RESET_PLANE_AND_REGISTER()                               \
  do {                                                                  \
    int greg = 0;                                                       \
                                                                        \
    if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
      ENCODE_SHIFT_IN;                                                  \
    while (greg < 4)                                                    \
      {                                                                 \
        int initial_id = CODING_ISO_INITIAL (coding, greg);             \
                                                                        \
        if (initial_id >= 0                                             \
            && initial_id != CODING_ISO_DESIGNATION (coding, greg))     \
          {                                                             \
            struct charset *initial_charset                             \
              = CHARSET_FROM_ID (initial_id);                           \
                                                                        \
            ENCODE_DESIGNATION (initial_charset, greg, coding);         \
          }                                                             \
        greg++;                                                         \
      }                                                                 \
  } while (0)
3684
3685
3686 /* Produce designation sequences of charsets in the line started from
3687    SRC to a place pointed by DST, and return updated DST.
3688
3689    If the current block ends before any end-of-line, we may fail to
3690    find all the necessary designations.  */
3691
3692 static unsigned char *
3693 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3694      struct coding_system *coding;
3695      int *charbuf, *charbuf_end;
3696      unsigned char *dst;
3697 {
3698   struct charset *charset;
3699   /* Table of charsets to be designated to each graphic register.  */
3700   int r[4];
3701   int c, found = 0, reg;
3702   int produced_chars = 0;
3703   int multibytep = coding->dst_multibyte;
3704   Lisp_Object attrs;
3705   Lisp_Object charset_list;
3706
3707   attrs = CODING_ID_ATTRS (coding->id);
3708   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3709   if (EQ (charset_list, Qiso_2022))
3710     charset_list = Viso_2022_charset_list;
3711
3712   for (reg = 0; reg < 4; reg++)
3713     r[reg] = -1;
3714
3715   while (found < 4)
3716     {
3717       int id;
3718
3719       c = *charbuf++;
3720       if (c == '\n')
3721         break;
3722       charset = char_charset (c, charset_list, NULL);
3723       id = CHARSET_ID (charset);
3724       reg = CODING_ISO_REQUEST (coding, id);
3725       if (reg >= 0 && r[reg] < 0)
3726         {
3727           found++;
3728           r[reg] = id;
3729         }
3730     }
3731
3732   if (found)
3733     {
3734       for (reg = 0; reg < 4; reg++)
3735         if (r[reg] >= 0
3736             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3737           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3738     }
3739
3740   return dst;
3741 }
3742
3743 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3744
3745 static int
3746 encode_coding_iso_2022 (coding)
3747      struct coding_system *coding;
3748 {
3749   int multibytep = coding->dst_multibyte;
3750   int *charbuf = coding->charbuf;
3751   int *charbuf_end = charbuf + coding->charbuf_used;
3752   unsigned char *dst = coding->destination + coding->produced;
3753   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3754   int safe_room = 16;
3755   int bol_designation
3756     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3757        && CODING_ISO_BOL (coding));
3758   int produced_chars = 0;
3759   Lisp_Object attrs, eol_type, charset_list;
3760   int ascii_compatible;
3761   int c;
3762   int preferred_charset_id = -1;
3763
3764   CODING_GET_INFO (coding, attrs, charset_list);
3765   eol_type = CODING_ID_EOL_TYPE (coding->id);
3766   if (VECTORP (eol_type))
3767     eol_type = Qunix;
3768
3769   setup_iso_safe_charsets (attrs);
3770   /* Charset list may have been changed.  */
3771   charset_list = CODING_ATTR_CHARSET_LIST (attrs);              \
3772   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3773
3774   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3775
3776   while (charbuf < charbuf_end)
3777     {
3778       ASSURE_DESTINATION (safe_room);
3779
3780       if (bol_designation)
3781         {
3782           unsigned char *dst_prev = dst;
3783
3784           /* We have to produce designation sequences if any now.  */
3785           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3786           bol_designation = 0;
3787           /* We are sure that designation sequences are all ASCII bytes.  */
3788           produced_chars += dst - dst_prev;
3789         }
3790
3791       c = *charbuf++;
3792
3793       if (c < 0)
3794         {
3795           /* Handle an annotation.  */
3796           switch (*charbuf)
3797             {
3798             case CODING_ANNOTATE_COMPOSITION_MASK:
3799               /* Not yet implemented.  */
3800               break;
3801             case CODING_ANNOTATE_CHARSET_MASK:
3802               preferred_charset_id = charbuf[3];
3803               if (preferred_charset_id >= 0
3804                   && NILP (Fmemq (make_number (preferred_charset_id),
3805                                   charset_list)))
3806                 preferred_charset_id = -1;
3807               break;
3808             default:
3809               abort ();
3810             }
3811           charbuf += -c - 1;
3812           continue;
3813         }
3814
3815       /* Now encode the character C.  */
3816       if (c < 0x20 || c == 0x7F)
3817         {
3818           if (c == '\n'
3819               || (c == '\r' && EQ (eol_type, Qmac)))
3820             {
3821               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3822                 ENCODE_RESET_PLANE_AND_REGISTER ();
3823               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3824                 {
3825                   int i;
3826
3827                   for (i = 0; i < 4; i++)
3828                     CODING_ISO_DESIGNATION (coding, i)
3829                       = CODING_ISO_INITIAL (coding, i);
3830                 }
3831               bol_designation
3832                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3833             }
3834           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3835             ENCODE_RESET_PLANE_AND_REGISTER ();
3836           EMIT_ONE_ASCII_BYTE (c);
3837         }
3838       else if (ASCII_CHAR_P (c))
3839         {
3840           if (ascii_compatible)
3841             EMIT_ONE_ASCII_BYTE (c);
3842           else
3843             {
3844               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3845               ENCODE_ISO_CHARACTER (charset, c);
3846             }
3847         }
3848       else if (CHAR_BYTE8_P (c))
3849         {
3850           c = CHAR_TO_BYTE8 (c);
3851           EMIT_ONE_BYTE (c);
3852         }
3853       else
3854         {
3855           struct charset *charset;
3856
3857           if (preferred_charset_id >= 0)
3858             {
3859               charset = CHARSET_FROM_ID (preferred_charset_id);
3860               if (! CHAR_CHARSET_P (c, charset))
3861                 charset = char_charset (c, charset_list, NULL);
3862             }
3863           else
3864             charset = char_charset (c, charset_list, NULL);
3865           if (!charset)
3866             {
3867               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3868                 {
3869                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3870                   charset = CHARSET_FROM_ID (charset_ascii);
3871                 }
3872               else
3873                 {
3874                   c = coding->default_char;
3875                   charset = char_charset (c, charset_list, NULL);
3876                 }
3877             }
3878           ENCODE_ISO_CHARACTER (charset, c);
3879         }
3880     }
3881
3882   if (coding->mode & CODING_MODE_LAST_BLOCK
3883       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3884     {
3885       ASSURE_DESTINATION (safe_room);
3886       ENCODE_RESET_PLANE_AND_REGISTER ();
3887     }
3888   record_conversion_result (coding, CODING_RESULT_SUCCESS);
3889   CODING_ISO_BOL (coding) = bol_designation;
3890   coding->produced_char += produced_chars;
3891   coding->produced = dst - coding->destination;
3892   return 0;
3893 }
3894
3895 \f
/*** 8. SJIS and BIG5 handlers ***/
3897
3898 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3899    quite widely.  So, for the moment, Emacs supports them in the bare
3900    C code.  But, in the future, they may be supported only by CCL.  */
3901
3902 /* SJIS is a coding system encoding three character sets: ASCII, right
3903    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3904    as is.  A character of charset katakana-jisx0201 is encoded by
3905    "position-code + 0x80".  A character of charset japanese-jisx0208
   is encoded in two bytes; its two position-codes are divided and
   shifted so that they fit in the ranges below.
3908
3909    --- CODE RANGE of SJIS ---
3910    (character set)      (range)
3911    ASCII                0x00 .. 0x7F
3912    KATAKANA-JISX0201    0xA0 .. 0xDF
3913    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3914             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3915    -------------------------------
3916
3917 */
3918
3919 /* BIG5 is a coding system encoding two character sets: ASCII and
3920    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3921    character set and is encoded in two-byte.
3922
3923    --- CODE RANGE of BIG5 ---
3924    (character set)      (range)
3925    ASCII                0x00 .. 0x7F
3926    Big5 (1st byte)      0xA1 .. 0xFE
3927         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3928    --------------------------
3929
3930   */
3931
3932 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3933    Check if a text is encoded in SJIS.  If it is, return
3934    CATEGORY_MASK_SJIS, else return 0.  */
3935
3936 static int
3937 detect_coding_sjis (coding, detect_info)
3938      struct coding_system *coding;
3939      struct coding_detection_info *detect_info;
3940 {
3941   const unsigned char *src = coding->source, *src_base;
3942   const unsigned char *src_end = coding->source + coding->src_bytes;
3943   int multibytep = coding->src_multibyte;
3944   int consumed_chars = 0;
3945   int found = 0;
3946   int c;
3947
3948   detect_info->checked |= CATEGORY_MASK_SJIS;
3949   /* A coding system of this category is always ASCII compatible.  */
3950   src += coding->head_ascii;
3951
3952   while (1)
3953     {
3954       src_base = src;
3955       ONE_MORE_BYTE (c);
3956       if (c < 0x80)
3957         continue;
3958       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3959         {
3960           ONE_MORE_BYTE (c);
3961           if (c < 0x40 || c == 0x7F || c > 0xFC)
3962             break;
3963           found = CATEGORY_MASK_SJIS;
3964         }
3965       else if (c >= 0xA0 && c < 0xE0)
3966         found = CATEGORY_MASK_SJIS;
3967       else
3968         break;
3969     }
3970   detect_info->rejected |= CATEGORY_MASK_SJIS;
3971   return 0;
3972
3973  no_more_source:
3974   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3975     {
3976       detect_info->rejected |= CATEGORY_MASK_SJIS;
3977       return 0;
3978     }
3979   detect_info->found |= found;
3980   return 1;
3981 }
3982
3983 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3984    Check if a text is encoded in BIG5.  If it is, return
3985    CATEGORY_MASK_BIG5, else return 0.  */
3986
3987 static int
3988 detect_coding_big5 (coding, detect_info)
3989      struct coding_system *coding;
3990      struct coding_detection_info *detect_info;
3991 {
3992   const unsigned char *src = coding->source, *src_base;
3993   const unsigned char *src_end = coding->source + coding->src_bytes;
3994   int multibytep = coding->src_multibyte;
3995   int consumed_chars = 0;
3996   int found = 0;
3997   int c;
3998
3999   detect_info->checked |= CATEGORY_MASK_BIG5;
4000   /* A coding system of this category is always ASCII compatible.  */
4001   src += coding->head_ascii;
4002
4003   while (1)
4004     {
4005       src_base = src;
4006       ONE_MORE_BYTE (c);
4007       if (c < 0x80)
4008         continue;
4009       if (c >= 0xA1)
4010         {
4011           ONE_MORE_BYTE (c);
4012           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4013             return 0;
4014           found = CATEGORY_MASK_BIG5;
4015         }
4016       else
4017         break;
4018     }
4019   detect_info->rejected |= CATEGORY_MASK_BIG5;
4020   return 0;
4021
4022  no_more_source:
4023   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4024     {
4025       detect_info->rejected |= CATEGORY_MASK_BIG5;
4026       return 0;
4027     }
4028   detect_info->found |= found;
4029   return 1;
4030 }
4031
4032 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4033    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
4034
4035 static void
4036 decode_coding_sjis (coding)
4037      struct coding_system *coding;
4038 {
4039   const unsigned char *src = coding->source + coding->consumed;
4040   const unsigned char *src_end = coding->source + coding->src_bytes;
4041   const unsigned char *src_base;
4042   int *charbuf = coding->charbuf + coding->charbuf_used;
4043   int *charbuf_end
4044     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4045   int consumed_chars = 0, consumed_chars_base;
4046   int multibytep = coding->src_multibyte;
4047   struct charset *charset_roman, *charset_kanji, *charset_kana;
4048   struct charset *charset_kanji2;
4049   Lisp_Object attrs, charset_list, val;
4050   int char_offset = coding->produced_char;
4051   int last_offset = char_offset;
4052   int last_id = charset_ascii;
4053
4054   CODING_GET_INFO (coding, attrs, charset_list);
4055
4056   val = charset_list;
4057   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4058   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4059   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4060   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4061
4062   while (1)
4063     {
4064       int c, c1;
4065       struct charset *charset;
4066
4067       src_base = src;
4068       consumed_chars_base = consumed_chars;
4069
4070       if (charbuf >= charbuf_end)
4071         break;
4072
4073       ONE_MORE_BYTE (c);
4074       if (c < 0)
4075         goto invalid_code;
4076       if (c < 0x80)
4077         charset = charset_roman;
4078       else if (c == 0x80 || c == 0xA0)
4079         goto invalid_code;
4080       else if (c >= 0xA1 && c <= 0xDF)
4081         {
4082           /* SJIS -> JISX0201-Kana */
4083           c &= 0x7F;
4084           charset = charset_kana;
4085         }
4086       else if (c <= 0xEF)
4087         {
4088           /* SJIS -> JISX0208 */
4089           ONE_MORE_BYTE (c1);
4090           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4091             goto invalid_code;
4092           c = (c << 8) | c1;
4093           SJIS_TO_JIS (c);
4094           charset = charset_kanji;
4095         }
4096       else if (c <= 0xFC && charset_kanji2)
4097         {
4098           /* SJIS -> JISX0213-2 */
4099           ONE_MORE_BYTE (c1);
4100           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4101             goto invalid_code;
4102           c = (c << 8) | c1;
4103           SJIS_TO_JIS2 (c);
4104           charset = charset_kanji2;
4105         }
4106       else
4107         goto invalid_code;
4108       if (charset->id != charset_ascii
4109           && last_id != charset->id)
4110         {
4111           if (last_id != charset_ascii)
4112             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4113           last_id = charset->id;
4114           last_offset = char_offset;
4115         }
4116       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4117       *charbuf++ = c;
4118       char_offset++;
4119       continue;
4120
4121     invalid_code:
4122       src = src_base;
4123       consumed_chars = consumed_chars_base;
4124       ONE_MORE_BYTE (c);
4125       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4126       char_offset++;
4127       coding->errors++;
4128     }
4129
4130  no_more_source:
4131   if (last_id != charset_ascii)
4132     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4133   coding->consumed_char += consumed_chars_base;
4134   coding->consumed = src_base - coding->source;
4135   coding->charbuf_used = charbuf - coding->charbuf;
4136 }
4137
4138 static void
4139 decode_coding_big5 (coding)
4140      struct coding_system *coding;
4141 {
4142   const unsigned char *src = coding->source + coding->consumed;
4143   const unsigned char *src_end = coding->source + coding->src_bytes;
4144   const unsigned char *src_base;
4145   int *charbuf = coding->charbuf + coding->charbuf_used;
4146   int *charbuf_end
4147     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4148   int consumed_chars = 0, consumed_chars_base;
4149   int multibytep = coding->src_multibyte;
4150   struct charset *charset_roman, *charset_big5;
4151   Lisp_Object attrs, charset_list, val;
4152   int char_offset = coding->produced_char;
4153   int last_offset = char_offset;
4154   int last_id = charset_ascii;
4155
4156   CODING_GET_INFO (coding, attrs, charset_list);
4157   val = charset_list;
4158   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4159   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4160
4161   while (1)
4162     {
4163       int c, c1;
4164       struct charset *charset;
4165
4166       src_base = src;
4167       consumed_chars_base = consumed_chars;
4168
4169       if (charbuf >= charbuf_end)
4170         break;
4171
4172       ONE_MORE_BYTE (c);
4173
4174       if (c < 0)
4175         goto invalid_code;
4176       if (c < 0x80)
4177         charset = charset_roman;
4178       else
4179         {
4180           /* BIG5 -> Big5 */
4181           if (c < 0xA1 || c > 0xFE)
4182             goto invalid_code;
4183           ONE_MORE_BYTE (c1);
4184           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4185             goto invalid_code;
4186           c = c << 8 | c1;
4187           charset = charset_big5;
4188         }
4189       if (charset->id != charset_ascii
4190           && last_id != charset->id)
4191         {
4192           if (last_id != charset_ascii)
4193             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4194           last_id = charset->id;
4195           last_offset = char_offset;
4196         }
4197       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4198       *charbuf++ = c;
4199       char_offset++;
4200       continue;
4201
4202     invalid_code:
4203       src = src_base;
4204       consumed_chars = consumed_chars_base;
4205       ONE_MORE_BYTE (c);
4206       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4207       char_offset++;
4208       coding->errors++;
4209     }
4210
4211  no_more_source:
4212   if (last_id != charset_ascii)
4213     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4214   coding->consumed_char += consumed_chars_base;
4215   coding->consumed = src_base - coding->source;
4216   coding->charbuf_used = charbuf - coding->charbuf;
4217 }
4218
4219 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4220    This function can encode charsets `ascii', `katakana-jisx0201',
4221    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
4222    are sure that all these charsets are registered as official charset
4223    (i.e. do not have extended leading-codes).  Characters of other
4224    charsets are produced without any encoding.  If SJIS_P is 1, encode
4225    SJIS text, else encode BIG5 text.  */
4226
4227 static int
4228 encode_coding_sjis (coding)
4229      struct coding_system *coding;
4230 {
4231   int multibytep = coding->dst_multibyte;
4232   int *charbuf = coding->charbuf;
4233   int *charbuf_end = charbuf + coding->charbuf_used;
4234   unsigned char *dst = coding->destination + coding->produced;
4235   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4236   int safe_room = 4;
4237   int produced_chars = 0;
4238   Lisp_Object attrs, charset_list, val;
4239   int ascii_compatible;
4240   struct charset *charset_roman, *charset_kanji, *charset_kana;
4241   struct charset *charset_kanji2;
4242   int c;
4243
4244   CODING_GET_INFO (coding, attrs, charset_list);
4245   val = charset_list;
4246   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4247   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4248   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4249   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4250
4251   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4252
4253   while (charbuf < charbuf_end)
4254     {
4255       ASSURE_DESTINATION (safe_room);
4256       c = *charbuf++;
4257       /* Now encode the character C.  */
4258       if (ASCII_CHAR_P (c) && ascii_compatible)
4259         EMIT_ONE_ASCII_BYTE (c);
4260       else if (CHAR_BYTE8_P (c))
4261         {
4262           c = CHAR_TO_BYTE8 (c);
4263           EMIT_ONE_BYTE (c);
4264         }
4265       else
4266         {
4267           unsigned code;
4268           struct charset *charset = char_charset (c, charset_list, &code);
4269
4270           if (!charset)
4271             {
4272               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4273                 {
4274                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4275                   charset = CHARSET_FROM_ID (charset_ascii);
4276                 }
4277               else
4278                 {
4279                   c = coding->default_char;
4280                   charset = char_charset (c, charset_list, &code);
4281                 }
4282             }
4283           if (code == CHARSET_INVALID_CODE (charset))
4284             abort ();
4285           if (charset == charset_kanji)
4286             {
4287               int c1, c2;
4288               JIS_TO_SJIS (code);
4289               c1 = code >> 8, c2 = code & 0xFF;
4290               EMIT_TWO_BYTES (c1, c2);
4291             }
4292           else if (charset == charset_kana)
4293             EMIT_ONE_BYTE (code | 0x80);
4294           else if (charset_kanji2 && charset == charset_kanji2)
4295             {
4296               int c1, c2;
4297
4298               c1 = code >> 8;
4299               if (c1 == 0x21 || (c1 >= 0x23 && c1 < 0x25)
4300                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4301                 {
4302                   JIS_TO_SJIS2 (code);
4303                   c1 = code >> 8, c2 = code & 0xFF;
4304                   EMIT_TWO_BYTES (c1, c2);
4305                 }
4306               else
4307                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4308             }
4309           else
4310             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4311         }
4312     }
4313   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4314   coding->produced_char += produced_chars;
4315   coding->produced = dst - coding->destination;
4316   return 0;
4317 }
4318
4319 static int
4320 encode_coding_big5 (coding)
4321      struct coding_system *coding;
4322 {
4323   int multibytep = coding->dst_multibyte;
4324   int *charbuf = coding->charbuf;
4325   int *charbuf_end = charbuf + coding->charbuf_used;
4326   unsigned char *dst = coding->destination + coding->produced;
4327   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4328   int safe_room = 4;
4329   int produced_chars = 0;
4330   Lisp_Object attrs, charset_list, val;
4331   int ascii_compatible;
4332   struct charset *charset_roman, *charset_big5;
4333   int c;
4334
4335   CODING_GET_INFO (coding, attrs, charset_list);
4336   val = charset_list;
4337   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4338   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4339   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4340
4341   while (charbuf < charbuf_end)
4342     {
4343       ASSURE_DESTINATION (safe_room);
4344       c = *charbuf++;
4345       /* Now encode the character C.  */
4346       if (ASCII_CHAR_P (c) && ascii_compatible)
4347         EMIT_ONE_ASCII_BYTE (c);
4348       else if (CHAR_BYTE8_P (c))
4349         {
4350           c = CHAR_TO_BYTE8 (c);
4351           EMIT_ONE_BYTE (c);
4352         }
4353       else
4354         {
4355           unsigned code;
4356           struct charset *charset = char_charset (c, charset_list, &code);
4357
4358           if (! charset)
4359             {
4360               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4361                 {
4362                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4363                   charset = CHARSET_FROM_ID (charset_ascii);
4364                 }
4365               else
4366                 {
4367                   c = coding->default_char;
4368                   charset = char_charset (c, charset_list, &code);
4369                 }
4370             }
4371           if (code == CHARSET_INVALID_CODE (charset))
4372             abort ();
4373           if (charset == charset_big5)
4374             {
4375               int c1, c2;
4376
4377               c1 = code >> 8, c2 = code & 0xFF;
4378               EMIT_TWO_BYTES (c1, c2);
4379             }
4380           else
4381             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4382         }
4383     }
4384   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4385   coding->produced_char += produced_chars;
4386   coding->produced = dst - coding->destination;
4387   return 0;
4388 }
4389
4390 \f
/*** 9. CCL handlers ***/
4392
4393 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4394    Check if a text is encoded in a coding system of which
4395    encoder/decoder are written in CCL program.  If it is, return
4396    CATEGORY_MASK_CCL, else return 0.  */
4397
4398 static int
4399 detect_coding_ccl (coding, detect_info)
4400      struct coding_system *coding;
4401      struct coding_detection_info *detect_info;
4402 {
4403   const unsigned char *src = coding->source, *src_base;
4404   const unsigned char *src_end = coding->source + coding->src_bytes;
4405   int multibytep = coding->src_multibyte;
4406   int consumed_chars = 0;
4407   int found = 0;
4408   unsigned char *valids;
4409   int head_ascii = coding->head_ascii;
4410   Lisp_Object attrs;
4411
4412   detect_info->checked |= CATEGORY_MASK_CCL;
4413
4414   coding = &coding_categories[coding_category_ccl];
4415   valids = CODING_CCL_VALIDS (coding);
4416   attrs = CODING_ID_ATTRS (coding->id);
4417   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4418     src += head_ascii;
4419
4420   while (1)
4421     {
4422       int c;
4423
4424       src_base = src;
4425       ONE_MORE_BYTE (c);
4426       if (c < 0 || ! valids[c])
4427         break;
4428       if ((valids[c] > 1))
4429         found = CATEGORY_MASK_CCL;
4430     }
4431   detect_info->rejected |= CATEGORY_MASK_CCL;
4432   return 0;
4433
4434  no_more_source:
4435   detect_info->found |= found;
4436   return 1;
4437 }
4438
4439 static void
4440 decode_coding_ccl (coding)
4441      struct coding_system *coding;
4442 {
4443   const unsigned char *src = coding->source + coding->consumed;
4444   const unsigned char *src_end = coding->source + coding->src_bytes;
4445   int *charbuf = coding->charbuf + coding->charbuf_used;
4446   int *charbuf_end = coding->charbuf + coding->charbuf_size;
4447   int consumed_chars = 0;
4448   int multibytep = coding->src_multibyte;
4449   struct ccl_program ccl;
4450   int source_charbuf[1024];
4451   int source_byteidx[1024];
4452   Lisp_Object attrs, charset_list;
4453
4454   CODING_GET_INFO (coding, attrs, charset_list);
4455   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4456
4457   while (src < src_end)
4458     {
4459       const unsigned char *p = src;
4460       int *source, *source_end;
4461       int i = 0;
4462
4463       if (multibytep)
4464         while (i < 1024 && p < src_end)
4465           {
4466             source_byteidx[i] = p - src;
4467             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4468           }
4469       else
4470         while (i < 1024 && p < src_end)
4471           source_charbuf[i++] = *p++;
4472
4473       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4474         ccl.last_block = 1;
4475
4476       source = source_charbuf;
4477       source_end = source + i;
4478       while (source < source_end)
4479         {
4480           ccl_driver (&ccl, source, charbuf,
4481                       source_end - source, charbuf_end - charbuf,
4482                       charset_list);
4483           source += ccl.consumed;
4484           charbuf += ccl.produced;
4485           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4486             break;
4487         }
4488       if (source < source_end)
4489         src += source_byteidx[source - source_charbuf];
4490       else
4491         src = p;
4492       consumed_chars += source - source_charbuf;
4493
4494       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4495           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4496         break;
4497     }
4498
4499   switch (ccl.status)
4500     {
4501     case CCL_STAT_SUSPEND_BY_SRC:
4502       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4503       break;
4504     case CCL_STAT_SUSPEND_BY_DST:
4505       break;
4506     case CCL_STAT_QUIT:
4507     case CCL_STAT_INVALID_CMD:
4508       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4509       break;
4510     default:
4511       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4512       break;
4513     }
4514   coding->consumed_char += consumed_chars;
4515   coding->consumed = src - coding->source;
4516   coding->charbuf_used = charbuf - coding->charbuf;
4517 }
4518
4519 static int
4520 encode_coding_ccl (coding)
4521      struct coding_system *coding;
4522 {
4523   struct ccl_program ccl;
4524   int multibytep = coding->dst_multibyte;
4525   int *charbuf = coding->charbuf;
4526   int *charbuf_end = charbuf + coding->charbuf_used;
4527   unsigned char *dst = coding->destination + coding->produced;
4528   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4529   unsigned char *adjusted_dst_end = dst_end - 1;
4530   int destination_charbuf[1024];
4531   int i, produced_chars = 0;
4532   Lisp_Object attrs, charset_list;
4533
4534   CODING_GET_INFO (coding, attrs, charset_list);
4535   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4536
4537   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4538   ccl.dst_multibyte = coding->dst_multibyte;
4539
4540   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4541     {
4542       int dst_bytes = dst_end - dst;
4543       if (dst_bytes > 1024)
4544         dst_bytes = 1024;
4545
4546       ccl_driver (&ccl, charbuf, destination_charbuf,
4547                   charbuf_end - charbuf, dst_bytes, charset_list);
4548       charbuf += ccl.consumed;
4549       if (multibytep)
4550         for (i = 0; i < ccl.produced; i++)
4551           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4552       else
4553         {
4554           for (i = 0; i < ccl.produced; i++)
4555             *dst++ = destination_charbuf[i] & 0xFF;
4556           produced_chars += ccl.produced;
4557         }
4558     }
4559
4560   switch (ccl.status)
4561     {
4562     case CCL_STAT_SUSPEND_BY_SRC:
4563       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4564       break;
4565     case CCL_STAT_SUSPEND_BY_DST:
4566       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4567       break;
4568     case CCL_STAT_QUIT:
4569     case CCL_STAT_INVALID_CMD:
4570       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4571       break;
4572     default:
4573       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4574       break;
4575     }
4576
4577   coding->produced_char += produced_chars;
4578   coding->produced = dst - coding->destination;
4579   return 0;
4580 }
4581
4582
4583 \f
4584 /*** 10, 11. no-conversion handlers ***/
4585
4586 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4587
4588 static void
4589 decode_coding_raw_text (coding)
4590      struct coding_system *coding;
4591 {
4592   coding->chars_at_source = 1;
4593   coding->consumed_char = 0;
4594   coding->consumed = 0;
4595   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4596 }
4597
4598 static int
4599 encode_coding_raw_text (coding)
4600      struct coding_system *coding;
4601 {
4602   int multibytep = coding->dst_multibyte;
4603   int *charbuf = coding->charbuf;
4604   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4605   unsigned char *dst = coding->destination + coding->produced;
4606   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4607   int produced_chars = 0;
4608   int c;
4609
4610   if (multibytep)
4611     {
4612       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4613
4614       if (coding->src_multibyte)
4615         while (charbuf < charbuf_end)
4616           {
4617             ASSURE_DESTINATION (safe_room);
4618             c = *charbuf++;
4619             if (ASCII_CHAR_P (c))
4620               EMIT_ONE_ASCII_BYTE (c);
4621             else if (CHAR_BYTE8_P (c))
4622               {
4623                 c = CHAR_TO_BYTE8 (c);
4624                 EMIT_ONE_BYTE (c);
4625               }
4626             else
4627               {
4628                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4629
4630                 CHAR_STRING_ADVANCE (c, p1);
4631                 while (p0 < p1)
4632                   {
4633                     EMIT_ONE_BYTE (*p0);
4634                     p0++;
4635                   }
4636               }
4637           }
4638       else
4639         while (charbuf < charbuf_end)
4640           {
4641             ASSURE_DESTINATION (safe_room);
4642             c = *charbuf++;
4643             EMIT_ONE_BYTE (c);
4644           }
4645     }
4646   else
4647     {
4648       if (coding->src_multibyte)
4649         {
4650           int safe_room = MAX_MULTIBYTE_LENGTH;
4651
4652           while (charbuf < charbuf_end)
4653             {
4654               ASSURE_DESTINATION (safe_room);
4655               c = *charbuf++;
4656               if (ASCII_CHAR_P (c))
4657                 *dst++ = c;
4658               else if (CHAR_BYTE8_P (c))
4659                 *dst++ = CHAR_TO_BYTE8 (c);
4660               else
4661                 CHAR_STRING_ADVANCE (c, dst);
4662               produced_chars++;
4663             }
4664         }
4665       else
4666         {
4667           ASSURE_DESTINATION (charbuf_end - charbuf);
4668           while (charbuf < charbuf_end && dst < dst_end)
4669             *dst++ = *charbuf++;
4670           produced_chars = dst - (coding->destination + coding->dst_bytes);
4671         }
4672     }
4673   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4674   coding->produced_char += produced_chars;
4675   coding->produced = dst - coding->destination;
4676   return 0;
4677 }
4678
4679 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4680    Check if a text is encoded in a charset-based coding system.  If it
4681    is, return 1, else return 0.  */
4682
4683 static int
4684 detect_coding_charset (coding, detect_info)
4685      struct coding_system *coding;
4686      struct coding_detection_info *detect_info;
4687 {
4688   const unsigned char *src = coding->source, *src_base;
4689   const unsigned char *src_end = coding->source + coding->src_bytes;
4690   int multibytep = coding->src_multibyte;
4691   int consumed_chars = 0;
4692   Lisp_Object attrs, valids;
4693   int found = 0;
4694
4695   detect_info->checked |= CATEGORY_MASK_CHARSET;
4696
4697   coding = &coding_categories[coding_category_charset];
4698   attrs = CODING_ID_ATTRS (coding->id);
4699   valids = AREF (attrs, coding_attr_charset_valids);
4700
4701   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4702     src += coding->head_ascii;
4703
4704   while (1)
4705     {
4706       int c;
4707
4708       src_base = src;
4709       ONE_MORE_BYTE (c);
4710       if (c < 0)
4711         continue;
4712       if (NILP (AREF (valids, c)))
4713         break;
4714       if (c >= 0x80)
4715         found = CATEGORY_MASK_CHARSET;
4716     }
4717   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4718   return 0;
4719
4720  no_more_source:
4721   detect_info->found |= found;
4722   return 1;
4723 }
4724
4725 static void
4726 decode_coding_charset (coding)
4727      struct coding_system *coding;
4728 {
4729   const unsigned char *src = coding->source + coding->consumed;
4730   const unsigned char *src_end = coding->source + coding->src_bytes;
4731   const unsigned char *src_base;
4732   int *charbuf = coding->charbuf + coding->charbuf_used;
4733   int *charbuf_end
4734     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4735   int consumed_chars = 0, consumed_chars_base;
4736   int multibytep = coding->src_multibyte;
4737   Lisp_Object attrs, charset_list, valids;
4738   int char_offset = coding->produced_char;
4739   int last_offset = char_offset;
4740   int last_id = charset_ascii;
4741
4742   CODING_GET_INFO (coding, attrs, charset_list);
4743   valids = AREF (attrs, coding_attr_charset_valids);
4744
4745   while (1)
4746     {
4747       int c;
4748       Lisp_Object val;
4749       struct charset *charset;
4750       int dim;
4751       int len = 1;
4752       unsigned code;
4753
4754       src_base = src;
4755       consumed_chars_base = consumed_chars;
4756
4757       if (charbuf >= charbuf_end)
4758         break;
4759
4760       ONE_MORE_BYTE (c);
4761       if (c < 0)
4762         goto invalid_code;
4763       code = c;
4764
4765       val = AREF (valids, c);
4766       if (NILP (val))
4767         goto invalid_code;
4768       if (INTEGERP (val))
4769         {
4770           charset = CHARSET_FROM_ID (XFASTINT (val));
4771           dim = CHARSET_DIMENSION (charset);
4772           while (len < dim)
4773             {
4774               ONE_MORE_BYTE (c);
4775               code = (code << 8) | c;
4776               len++;
4777             }
4778           CODING_DECODE_CHAR (coding, src, src_base, src_end,
4779                               charset, code, c);
4780         }
4781       else
4782         {
4783           /* VAL is a list of charset IDs.  It is assured that the
4784              list is sorted by charset dimensions (smaller one
4785              comes first).  */
4786           while (CONSP (val))
4787             {
4788               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4789               dim = CHARSET_DIMENSION (charset);
4790               while (len < dim)
4791                 {
4792                   ONE_MORE_BYTE (c);
4793                   code = (code << 8) | c;
4794                   len++;
4795                 }
4796               CODING_DECODE_CHAR (coding, src, src_base,
4797                                   src_end, charset, code, c);
4798               if (c >= 0)
4799                 break;
4800               val = XCDR (val);
4801             }
4802         }
4803       if (c < 0)
4804         goto invalid_code;
4805       if (charset->id != charset_ascii
4806           && last_id != charset->id)
4807         {
4808           if (last_id != charset_ascii)
4809             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4810           last_id = charset->id;
4811           last_offset = char_offset;
4812         }
4813
4814       *charbuf++ = c;
4815       char_offset++;
4816       continue;
4817
4818     invalid_code:
4819       src = src_base;
4820       consumed_chars = consumed_chars_base;
4821       ONE_MORE_BYTE (c);
4822       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4823       char_offset++;
4824       coding->errors++;
4825     }
4826
4827  no_more_source:
4828   if (last_id != charset_ascii)
4829     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4830   coding->consumed_char += consumed_chars_base;
4831   coding->consumed = src_base - coding->source;
4832   coding->charbuf_used = charbuf - coding->charbuf;
4833 }
4834
4835 static int
4836 encode_coding_charset (coding)
4837      struct coding_system *coding;
4838 {
4839   int multibytep = coding->dst_multibyte;
4840   int *charbuf = coding->charbuf;
4841   int *charbuf_end = charbuf + coding->charbuf_used;
4842   unsigned char *dst = coding->destination + coding->produced;
4843   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4844   int safe_room = MAX_MULTIBYTE_LENGTH;
4845   int produced_chars = 0;
4846   Lisp_Object attrs, charset_list;
4847   int ascii_compatible;
4848   int c;
4849
4850   CODING_GET_INFO (coding, attrs, charset_list);
4851   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4852
4853   while (charbuf < charbuf_end)
4854     {
4855       struct charset *charset;
4856       unsigned code;
4857
4858       ASSURE_DESTINATION (safe_room);
4859       c = *charbuf++;
4860       if (ascii_compatible && ASCII_CHAR_P (c))
4861         EMIT_ONE_ASCII_BYTE (c);
4862       else if (CHAR_BYTE8_P (c))
4863         {
4864           c = CHAR_TO_BYTE8 (c);
4865           EMIT_ONE_BYTE (c);
4866         }
4867       else
4868         {
4869           charset = char_charset (c, charset_list, &code);
4870           if (charset)
4871             {
4872               if (CHARSET_DIMENSION (charset) == 1)
4873                 EMIT_ONE_BYTE (code);
4874               else if (CHARSET_DIMENSION (charset) == 2)
4875                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4876               else if (CHARSET_DIMENSION (charset) == 3)
4877                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4878               else
4879                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4880                                  (code >> 8) & 0xFF, code & 0xFF);
4881             }
4882           else
4883             {
4884               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4885                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4886               else
4887                 c = coding->default_char;
4888               EMIT_ONE_BYTE (c);
4889             }
4890         }
4891     }
4892
4893   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4894   coding->produced_char += produced_chars;
4895   coding->produced = dst - coding->destination;
4896   return 0;
4897 }
4898
4899 \f
4900 /*** 7. C library functions ***/
4901
4902 /* Setup coding context CODING from information about CODING_SYSTEM.
4903    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4904    CODING_SYSTEM is invalid, signal an error.  */
4905
4906 void
4907 setup_coding_system (coding_system, coding)
4908      Lisp_Object coding_system;
4909      struct coding_system *coding;
4910 {
4911   Lisp_Object attrs;
4912   Lisp_Object eol_type;
4913   Lisp_Object coding_type;
4914   Lisp_Object val;
4915
4916   if (NILP (coding_system))
4917     coding_system = Qno_conversion;
4918
4919   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4920
4921   attrs = CODING_ID_ATTRS (coding->id);
4922   eol_type = CODING_ID_EOL_TYPE (coding->id);
4923
4924   coding->mode = 0;
4925   coding->head_ascii = -1;
4926   coding->common_flags
4927     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4928   if (! NILP (CODING_ATTR_POST_READ (attrs)))
4929     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4930   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4931     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4932   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4933     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4934
4935   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4936   coding->max_charset_id = SCHARS (val) - 1;
4937   coding->safe_charsets = (char *) SDATA (val);
4938   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4939
4940   coding_type = CODING_ATTR_TYPE (attrs);
4941   if (EQ (coding_type, Qundecided))
4942     {
4943       coding->detector = NULL;
4944       coding->decoder = decode_coding_raw_text;
4945       coding->encoder = encode_coding_raw_text;
4946       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4947     }
4948   else if (EQ (coding_type, Qiso_2022))
4949     {
4950       int i;
4951       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4952
4953       /* Invoke graphic register 0 to plane 0.  */
4954       CODING_ISO_INVOCATION (coding, 0) = 0;
4955       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4956       CODING_ISO_INVOCATION (coding, 1)
4957         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4958       /* Setup the initial status of designation.  */
4959       for (i = 0; i < 4; i++)
4960         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4961       /* Not single shifting initially.  */
4962       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4963       /* Beginning of buffer should also be regarded as bol. */
4964       CODING_ISO_BOL (coding) = 1;
4965       coding->detector = detect_coding_iso_2022;
4966       coding->decoder = decode_coding_iso_2022;
4967       coding->encoder = encode_coding_iso_2022;
4968       if (flags & CODING_ISO_FLAG_SAFE)
4969         coding->mode |= CODING_MODE_SAFE_ENCODING;
4970       coding->common_flags
4971         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4972             | CODING_REQUIRE_FLUSHING_MASK);
4973       if (flags & CODING_ISO_FLAG_COMPOSITION)
4974         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4975       if (flags & CODING_ISO_FLAG_DESIGNATION)
4976         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4977       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4978         {
4979           setup_iso_safe_charsets (attrs);
4980           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4981           coding->max_charset_id = SCHARS (val) - 1;
4982           coding->safe_charsets = (char *) SDATA (val);
4983         }
4984       CODING_ISO_FLAGS (coding) = flags;
4985     }
4986   else if (EQ (coding_type, Qcharset))
4987     {
4988       coding->detector = detect_coding_charset;
4989       coding->decoder = decode_coding_charset;
4990       coding->encoder = encode_coding_charset;
4991       coding->common_flags
4992         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4993     }
4994   else if (EQ (coding_type, Qutf_8))
4995     {
4996       coding->detector = detect_coding_utf_8;
4997       coding->decoder = decode_coding_utf_8;
4998       coding->encoder = encode_coding_utf_8;
4999       coding->common_flags
5000         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5001     }
5002   else if (EQ (coding_type, Qutf_16))
5003     {
5004       val = AREF (attrs, coding_attr_utf_16_bom);
5005       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5006                                     : EQ (val, Qt) ? utf_16_with_bom
5007                                     : utf_16_without_bom);
5008       val = AREF (attrs, coding_attr_utf_16_endian);
5009       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5010                                        : utf_16_little_endian);
5011       CODING_UTF_16_SURROGATE (coding) = 0;
5012       coding->detector = detect_coding_utf_16;
5013       coding->decoder = decode_coding_utf_16;
5014       coding->encoder = encode_coding_utf_16;
5015       coding->common_flags
5016         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5017       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5018         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5019     }
5020   else if (EQ (coding_type, Qccl))
5021     {
5022       coding->detector = detect_coding_ccl;
5023       coding->decoder = decode_coding_ccl;
5024       coding->encoder = encode_coding_ccl;
5025       coding->common_flags
5026         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5027             | CODING_REQUIRE_FLUSHING_MASK);
5028     }
5029   else if (EQ (coding_type, Qemacs_mule))
5030     {
5031       coding->detector = detect_coding_emacs_mule;
5032       coding->decoder = decode_coding_emacs_mule;
5033       coding->encoder = encode_coding_emacs_mule;
5034       coding->common_flags
5035         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5036       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5037           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5038         {
5039           Lisp_Object tail, safe_charsets;
5040           int max_charset_id = 0;
5041
5042           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5043                tail = XCDR (tail))
5044             if (max_charset_id < XFASTINT (XCAR (tail)))
5045               max_charset_id = XFASTINT (XCAR (tail));
5046           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5047                                         make_number (255));
5048           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5049                tail = XCDR (tail))
5050             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5051           coding->max_charset_id = max_charset_id;
5052           coding->safe_charsets = (char *) SDATA (safe_charsets);
5053         }
5054     }
5055   else if (EQ (coding_type, Qshift_jis))
5056     {
5057       coding->detector = detect_coding_sjis;
5058       coding->decoder = decode_coding_sjis;
5059       coding->encoder = encode_coding_sjis;
5060       coding->common_flags
5061         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5062     }
5063   else if (EQ (coding_type, Qbig5))
5064     {
5065       coding->detector = detect_coding_big5;
5066       coding->decoder = decode_coding_big5;
5067       coding->encoder = encode_coding_big5;
5068       coding->common_flags
5069         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5070     }
5071   else                          /* EQ (coding_type, Qraw_text) */
5072     {
5073       coding->detector = NULL;
5074       coding->decoder = decode_coding_raw_text;
5075       coding->encoder = encode_coding_raw_text;
5076       if (! EQ (eol_type, Qunix))
5077         {
5078           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5079           if (! VECTORP (eol_type))
5080             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5081         }
5082
5083     }
5084
5085   return;
5086 }
5087
5088 /* Return raw-text or one of its subsidiaries that has the same
5089    eol_type as CODING-SYSTEM.  */
5090
5091 Lisp_Object
5092 raw_text_coding_system (coding_system)
5093      Lisp_Object coding_system;
5094 {
5095   Lisp_Object spec, attrs;
5096   Lisp_Object eol_type, raw_text_eol_type;
5097
5098   if (NILP (coding_system))
5099     return Qraw_text;
5100   spec = CODING_SYSTEM_SPEC (coding_system);
5101   attrs = AREF (spec, 0);
5102
5103   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5104     return coding_system;
5105
5106   eol_type = AREF (spec, 2);
5107   if (VECTORP (eol_type))
5108     return Qraw_text;
5109   spec = CODING_SYSTEM_SPEC (Qraw_text);
5110   raw_text_eol_type = AREF (spec, 2);
5111   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5112           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5113           : AREF (raw_text_eol_type, 2));
5114 }
5115
5116
5117 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5118    does, return one of the subsidiary that has the same eol-spec as
5119    PARENT.  Otherwise, return CODING_SYSTEM.  */
5120
5121 Lisp_Object
5122 coding_inherit_eol_type (coding_system, parent)
5123      Lisp_Object coding_system, parent;
5124 {
5125   Lisp_Object spec, eol_type;
5126
5127   if (NILP (coding_system))
5128     coding_system = Qraw_text;
5129   spec = CODING_SYSTEM_SPEC (coding_system);
5130   eol_type = AREF (spec, 2);
5131   if (VECTORP (eol_type)
5132       && ! NILP (parent))
5133     {
5134       Lisp_Object parent_spec;
5135       Lisp_Object parent_eol_type;
5136
5137       parent_spec
5138         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5139       parent_eol_type = AREF (parent_spec, 2);
5140       if (EQ (parent_eol_type, Qunix))
5141         coding_system = AREF (eol_type, 0);
5142       else if (EQ (parent_eol_type, Qdos))
5143         coding_system = AREF (eol_type, 1);
5144       else if (EQ (parent_eol_type, Qmac))
5145         coding_system = AREF (eol_type, 2);
5146     }
5147   return coding_system;
5148 }
5149
5150 /* Emacs has a mechanism to automatically detect a coding system if it
5151    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5152    it's impossible to distinguish some coding systems accurately
5153    because they use the same range of codes.  So, at first, coding
5154    systems are categorized as follows:
5155
5156    o coding-category-emacs-mule
5157
5158         The category for a coding system which has the same code range
5159         as Emacs' internal format.  Assigned the coding-system (Lisp
5160         symbol) `emacs-mule' by default.
5161
5162    o coding-category-sjis
5163
5164         The category for a coding system which has the same code range
5165         as SJIS.  Assigned the coding-system (Lisp
5166         symbol) `japanese-shift-jis' by default.
5167
5168    o coding-category-iso-7
5169
5170         The category for a coding system which has the same code range
5171         as ISO2022 of 7-bit environment.  This doesn't use any locking
5172         shift and single shift functions.  This can encode/decode all
5173         charsets.  Assigned the coding-system (Lisp symbol)
5174         `iso-2022-7bit' by default.
5175
5176    o coding-category-iso-7-tight
5177
5178         Same as coding-category-iso-7 except that this can
5179         encode/decode only the specified charsets.
5180
5181    o coding-category-iso-8-1
5182
5183         The category for a coding system which has the same code range
5184         as ISO2022 of 8-bit environment and graphic plane 1 used only
5185         for DIMENSION1 charset.  This doesn't use any locking shift
5186         and single shift functions.  Assigned the coding-system (Lisp
5187         symbol) `iso-latin-1' by default.
5188
5189    o coding-category-iso-8-2
5190
5191         The category for a coding system which has the same code range
5192         as ISO2022 of 8-bit environment and graphic plane 1 used only
5193         for DIMENSION2 charset.  This doesn't use any locking shift
5194         and single shift functions.  Assigned the coding-system (Lisp
5195         symbol) `japanese-iso-8bit' by default.
5196
5197    o coding-category-iso-7-else
5198
5199         The category for a coding system which has the same code range
5200         as ISO2022 of 7-bit environment but uses locking shift or
5201         single shift functions.  Assigned the coding-system (Lisp
5202         symbol) `iso-2022-7bit-lock' by default.
5203
5204    o coding-category-iso-8-else
5205
5206         The category for a coding system which has the same code range
5207         as ISO2022 of 8-bit environment but uses locking shift or
5208         single shift functions.  Assigned the coding-system (Lisp
5209         symbol) `iso-2022-8bit-ss2' by default.
5210
5211    o coding-category-big5
5212
5213         The category for a coding system which has the same code range
5214         as BIG5.  Assigned the coding-system (Lisp symbol)
5215         `cn-big5' by default.
5216
5217    o coding-category-utf-8
5218
5219         The category for a coding system which has the same code range
5220         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
5221         symbol) `utf-8' by default.
5222
5223    o coding-category-utf-16-be
5224
5225         The category for a coding system in which a text has an
5226         Unicode signature (cf. Unicode Standard) in the order of BIG
5227         endian at the head.  Assigned the coding-system (Lisp symbol)
5228         `utf-16-be' by default.
5229
5230    o coding-category-utf-16-le
5231
5232         The category for a coding system in which a text has an
5233         Unicode signature (cf. Unicode Standard) in the order of
5234         LITTLE endian at the head.  Assigned the coding-system (Lisp
5235         symbol) `utf-16-le' by default.
5236
5237    o coding-category-ccl
5238
5239         The category for a coding system of which encoder/decoder is
5240         written in CCL programs.  The default value is nil, i.e., no
5241         coding system is assigned.
5242
5243    o coding-category-binary
5244
5245         The category for a coding system not categorized in any of the
5246         above.  Assigned the coding-system (Lisp symbol)
5247         `no-conversion' by default.
5248
5249    Each of them is a Lisp symbol and the value is an actual
5250    `coding-system's (this is also a Lisp symbol) assigned by a user.
5251    What Emacs does actually is to detect a category of coding system.
5252    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5253    decide only one possible category, it selects a category of the
5254    highest priority.  Priorities of categories are also specified by a
5255    user in a Lisp variable `coding-category-list'.
5256
5257 */
5258
5259 #define EOL_SEEN_NONE   0
5260 #define EOL_SEEN_LF     1
5261 #define EOL_SEEN_CR     2
5262 #define EOL_SEEN_CRLF   4
5263
5264 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5265    SOURCE is encoded.  If CATEGORY is one of
5266    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5267    two-byte, else they are encoded by one-byte.
5268
5269    Return one of EOL_SEEN_XXX.  */
5270
5271 #define MAX_EOL_CHECK_COUNT 3
5272
5273 static int
5274 detect_eol (source, src_bytes, category)
5275      const unsigned char *source;
5276      EMACS_INT src_bytes;
5277      enum coding_category category;
5278 {
5279   const unsigned char *src = source, *src_end = src + src_bytes;
5280   unsigned char c;
5281   int total  = 0;
5282   int eol_seen = EOL_SEEN_NONE;
5283
5284   if ((1 << category) & CATEGORY_MASK_UTF_16)
5285     {
5286       int msb, lsb;
5287
5288       msb = category == (coding_category_utf_16_le
5289                          | coding_category_utf_16_le_nosig);
5290       lsb = 1 - msb;
5291
5292       while (src + 1 < src_end)
5293         {
5294           c = src[lsb];
5295           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5296             {
5297               int this_eol;
5298
5299               if (c == '\n')
5300                 this_eol = EOL_SEEN_LF;
5301               else if (src + 3 >= src_end
5302                        || src[msb + 2] != 0
5303                        || src[lsb + 2] != '\n')
5304                 this_eol = EOL_SEEN_CR;
5305               else
5306                 this_eol = EOL_SEEN_CRLF;
5307
5308               if (eol_seen == EOL_SEEN_NONE)
5309                 /* This is the first end-of-line.  */
5310                 eol_seen = this_eol;
5311               else if (eol_seen != this_eol)
5312                 {
5313                   /* The found type is different from what found before.  */
5314                   eol_seen = EOL_SEEN_LF;
5315                   break;
5316                 }
5317               if (++total == MAX_EOL_CHECK_COUNT)
5318                 break;
5319             }
5320           src += 2;
5321         }
5322     }
5323   else
5324     {
5325       while (src < src_end)
5326         {
5327           c = *src++;
5328           if (c == '\n' || c == '\r')
5329             {
5330               int this_eol;
5331
5332               if (c == '\n')
5333                 this_eol = EOL_SEEN_LF;
5334               else if (src >= src_end || *src != '\n')
5335                 this_eol = EOL_SEEN_CR;
5336               else
5337                 this_eol = EOL_SEEN_CRLF, src++;
5338
5339               if (eol_seen == EOL_SEEN_NONE)
5340                 /* This is the first end-of-line.  */
5341                 eol_seen = this_eol;
5342               else if (eol_seen != this_eol)
5343                 {
5344                   /* The found type is different from what found before.  */
5345                   eol_seen = EOL_SEEN_LF;
5346                   break;
5347                 }
5348               if (++total == MAX_EOL_CHECK_COUNT)
5349                 break;
5350             }
5351         }
5352     }
5353   return eol_seen;
5354 }
5355
5356
5357 static Lisp_Object
5358 adjust_coding_eol_type (coding, eol_seen)
5359      struct coding_system *coding;
5360      int eol_seen;
5361 {
5362   Lisp_Object eol_type;
5363
5364   eol_type = CODING_ID_EOL_TYPE (coding->id);
5365   if (eol_seen & EOL_SEEN_LF)
5366     {
5367       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5368       eol_type = Qunix;
5369     }
5370   else if (eol_seen & EOL_SEEN_CRLF)
5371     {
5372       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5373       eol_type = Qdos;
5374     }
5375   else if (eol_seen & EOL_SEEN_CR)
5376     {
5377       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5378       eol_type = Qmac;
5379     }
5380   return eol_type;
5381 }
5382
5383 /* Detect how a text specified in CODING is encoded.  If a coding
5384    system is detected, update fields of CODING by the detected coding
5385    system.  */
5386
5387 void
5388 detect_coding (coding)
5389      struct coding_system *coding;
5390 {
5391   const unsigned char *src, *src_end;
5392
5393   coding->consumed = coding->consumed_char = 0;
5394   coding->produced = coding->produced_char = 0;
5395   coding_set_source (coding);
5396
5397   src_end = coding->source + coding->src_bytes;
5398
5399   /* If we have not yet decided the text encoding type, detect it
5400      now.  */
5401   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5402     {
5403       int c, i;
5404       struct coding_detection_info detect_info;
5405
5406       detect_info.checked = detect_info.found = detect_info.rejected = 0;
5407       for (i = 0, src = coding->source; src < src_end; i++, src++)
5408         {
5409           c = *src;
5410           if (c & 0x80)
5411             break;
5412           if (c < 0x20
5413               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
5414               && ! inhibit_iso_escape_detection
5415               && ! detect_info.checked)
5416             {
5417               coding->head_ascii = src - (coding->source + coding->consumed);
5418               if (detect_coding_iso_2022 (coding, &detect_info))
5419                 {
5420                   /* We have scanned the whole data.  */
5421                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
5422                     /* We didn't find an 8-bit code.  */
5423                     src = src_end;
5424                   break;
5425                 }
5426             }
5427         }
5428       coding->head_ascii = src - (coding->source + coding->consumed);
5429
5430       if (coding->head_ascii < coding->src_bytes
5431           || detect_info.found)
5432         {
5433           enum coding_category category;
5434           struct coding_system *this;
5435
5436           if (coding->head_ascii == coding->src_bytes)
5437             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
5438             for (i = 0; i < coding_category_raw_text; i++)
5439               {
5440                 category = coding_priorities[i];
5441                 this = coding_categories + category;
5442                 if (detect_info.found & (1 << category))
5443                   break;
5444               }
5445           else
5446             for (i = 0; i < coding_category_raw_text; i++)
5447               {
5448                 category = coding_priorities[i];
5449                 this = coding_categories + category;
5450                 if (this->id < 0)
5451                   {
5452                     /* No coding system of this category is defined.  */
5453                     detect_info.rejected |= (1 << category);
5454                   }
5455                 else if (category >= coding_category_raw_text)
5456                   continue;
5457                 else if (detect_info.checked & (1 << category))
5458                   {
5459                     if (detect_info.found & (1 << category))
5460                       break;
5461                   }
5462                 else if ((*(this->detector)) (coding, &detect_info)
5463                          && detect_info.found & (1 << category))
5464                   {
5465                     if (category == coding_category_utf_16_auto)
5466                       {
5467                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5468                           category = coding_category_utf_16_le;
5469                         else
5470                           category = coding_category_utf_16_be;
5471                       }
5472                     break;
5473                   }
5474               }
5475
5476           if (i < coding_category_raw_text)
5477             setup_coding_system (CODING_ID_NAME (this->id), coding);
5478           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5479             setup_coding_system (Qraw_text, coding);
5480           else if (detect_info.rejected)
5481             for (i = 0; i < coding_category_raw_text; i++)
5482               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5483                 {
5484                   this = coding_categories + coding_priorities[i];
5485                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5486                   break;
5487                 }
5488         }
5489     }
5490   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5491            == coding_category_utf_16_auto)
5492     {
5493       Lisp_Object coding_systems;
5494       struct coding_detection_info detect_info;
5495
5496       coding_systems
5497         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5498       detect_info.found = detect_info.rejected = 0;
5499       if (CONSP (coding_systems)
5500           && detect_coding_utf_16 (coding, &detect_info))
5501         {
5502           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5503             setup_coding_system (XCAR (coding_systems), coding);
5504           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5505             setup_coding_system (XCDR (coding_systems), coding);
5506         }
5507     }
5508 }
5509
5510
     /* Convert the end-of-line format of the text that decoding with
        CODING has just produced.  The text is the CODING->produced
        bytes found at CODING->destination if CODING->dst_object is nil,
        and otherwise at the address BYTE_POS_ADDR gives for
        CODING->dst_pos_byte.  Nothing is done when the eol type of
        CODING is Qunix.  When the eol type is still a vector, the text
        is scanned to see which of LF, CRLF and CR occur (a mixture
        counts as LF) and, if any was seen, the eol type is settled by
        adjust_coding_eol_type.  For Qmac every CR becomes LF in place.
        For Qdos CRs are deleted and CODING->produced and
        CODING->produced_char are reduced by the number of deletions.  */
5511 static void
5512 decode_eol (coding)
5513      struct coding_system *coding;
5514 {
5515   Lisp_Object eol_type;
5516   unsigned char *p, *pbeg, *pend;
5517
5518   eol_type = CODING_ID_EOL_TYPE (coding->id);
5519   if (EQ (eol_type, Qunix))
5520     return;
5521
       /* Locate the produced text: [PBEG, PEND).  */
5522   if (NILP (coding->dst_object))
5523     pbeg = coding->destination;
5524   else
5525     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5526   pend = pbeg + coding->produced;
5527
5528   if (VECTORP (eol_type))
5529     {
           /* The eol format is not decided yet; detect it from the text.  */
5530       int eol_seen = EOL_SEEN_NONE;
5531
5532       for (p = pbeg; p < pend; p++)
5533         {
5534           if (*p == '\n')
5535             eol_seen |= EOL_SEEN_LF;
5536           else if (*p == '\r')
5537             {
5538               if (p + 1 < pend && *(p + 1) == '\n')
5539                 {
5540                   eol_seen |= EOL_SEEN_CRLF;
5541                   p++;
5542                 }
5543               else
5544                 eol_seen |= EOL_SEEN_CR;
5545             }
5546         }
           /* More than one kind of line ending was seen: treat as LF.  */
5547       if (eol_seen != EOL_SEEN_NONE
5548           && eol_seen != EOL_SEEN_LF
5549           && eol_seen != EOL_SEEN_CRLF
5550           && eol_seen != EOL_SEEN_CR)
5551         eol_seen = EOL_SEEN_LF;
5552       if (eol_seen != EOL_SEEN_NONE)
5553         eol_type = adjust_coding_eol_type (coding, eol_seen);
5554     }
5555
5556   if (EQ (eol_type, Qmac))
5557     {
5558       for (p = pbeg; p < pend; p++)
5559         if (*p == '\r')
5560           *p = '\n';
5561     }
5562   else if (EQ (eol_type, Qdos))
5563     {
           /* N counts the CR bytes removed.  Both loops scan backward
              from the next-to-last byte, so the last byte itself is
              never examined.  NOTE(review): a CR is removed whether or
              not the byte after it is LF -- confirm this is intended.  */
5564       int n = 0;
5565
5566       if (NILP (coding->dst_object))
5567         {
5568           for (p = pend - 2; p >= pbeg; p--)
5569             if (*p == '\r')
5570               {
                     /* Shift the tail down over the CR; PEND shrinks by
                        one after its old value is used for the length.  */
5571                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5572                 n++;
5573               }
5574         }
5575       else
5576         {
5577           for (p = pend - 2; p >= pbeg; p--)
5578             if (*p == '\r')
5579               {
5580                 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5581                 int pos = BYTE_TO_CHAR (pos_byte);
5582
5583                 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5584                 n++;
5585               }
5586         }
5587       coding->produced -= n;
5588       coding->produced_char -= n;
5589     }
5590 }
5591
5592
5593 /* Return a translation table (or list of them) from coding system
5594    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5595    decoding (ENCODEP is zero).
        If ATTRS specifies no table, the standard translation table for
        that direction is returned unchanged.  Otherwise a symbol is
        replaced by its Qtranslation_table property, a list is copied
        and each symbol in the copy is replaced the same way, and, when
        the standard table is a char-table, it is added as the last
        element so that the result is a list.
        If MAX_LOOKUP is non-null, *MAX_LOOKUP is set to the largest
        natural number found in extra slot 1 of the char-table(s) in the
        result, or to 1 if none is larger.  */
5596
5597 static Lisp_Object
5598 get_translation_table (attrs, encodep, max_lookup)
5599      Lisp_Object attrs;
5600      int encodep, *max_lookup;
5601 {
5602   Lisp_Object standard, translation_table;
5603   Lisp_Object val;
5604
5605   if (encodep)
5606     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5607       standard = Vstandard_translation_table_for_encode;
5608   else
5609     translation_table = CODING_ATTR_DECODE_TBL (attrs),
5610       standard = Vstandard_translation_table_for_decode;
5611   if (NILP (translation_table))
5612     translation_table = standard;
5613   else
5614     {
5615       if (SYMBOLP (translation_table))
5616         translation_table = Fget (translation_table, Qtranslation_table);
5617       else if (CONSP (translation_table))
5618         {
               /* Work on a copy so that the list stored in ATTRS is not
                  modified by the XSETCAR and nconc2 calls below.  */
5619           translation_table = Fcopy_sequence (translation_table);
5620           for (val = translation_table; CONSP (val); val = XCDR (val))
5621             if (SYMBOLP (XCAR (val)))
5622               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5623         }
5624       if (CHAR_TABLE_P (standard))
5625         {
5626           if (CONSP (translation_table))
5627             translation_table = nconc2 (translation_table,
5628                                         Fcons (standard, Qnil));
5629           else
5630             translation_table = Fcons (translation_table,
5631                                        Fcons (standard, Qnil));
5632         }
5633     }
5634
5635   if (max_lookup)
5636     {
5637       *max_lookup = 1;
5638       if (CHAR_TABLE_P (translation_table)
5639           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5640         {
5641           val = XCHAR_TABLE (translation_table)->extras[1];
5642           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5643             *max_lookup = XFASTINT (val);
5644         }
5645       else if (CONSP (translation_table))
5646         {
               /* This VAL shadows the function-level VAL.  */
5647           Lisp_Object tail, val;
5648
5649           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5650             if (CHAR_TABLE_P (XCAR (tail))
5651                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5652               {
5653                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5654                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5655                   *max_lookup = XFASTINT (val);
5656               }
5657         }
5658     }
5659   return translation_table;
5660 }
5661
     /* Look up character C in TABLE, which is a char-table or a list
        whose char-table elements are consulted in order (anything else
        leaves TRANS nil).  Whenever the entry found is a character, C
        is replaced by it and TRANS is reset to Qnil; in a list the
        lookup then goes on to the next table with the new C.  In a
        list the search stops at the first entry that is neither a
        character nor nil.  On exit TRANS is Qnil or the last such
        non-character entry.  C and TRANS are assigned to, so they must
        be lvalues, and the arguments are evaluated more than once.  */
5662 #define LOOKUP_TRANSLATION_TABLE(table, c, trans)               \
5663   do {                                                          \
5664     trans = Qnil;                                               \
5665     if (CHAR_TABLE_P (table))                                   \
5666       {                                                         \
5667         trans = CHAR_TABLE_REF (table, c);                      \
5668         if (CHARACTERP (trans))                                 \
5669           c = XFASTINT (trans), trans = Qnil;                   \
5670       }                                                         \
5671     else if (CONSP (table))                                     \
5672       {                                                         \
5673         Lisp_Object tail;                                       \
5674                                                                 \
5675         for (tail = table; CONSP (tail); tail = XCDR (tail))    \
5676           if (CHAR_TABLE_P (XCAR (tail)))                       \
5677             {                                                   \
5678               trans = CHAR_TABLE_REF (XCAR (tail), c);          \
5679               if (CHARACTERP (trans))                           \
5680                 c = XFASTINT (trans), trans = Qnil;             \
5681               else if (! NILP (trans))                          \
5682                 break;                                          \
5683             }                                                   \
5684       }                                                         \
5685   } while (0)
5686
5687
     /* Apply VAL, a non-character entry taken from a translation table,
        to the characters at BUF; BUF_END is the end of the available
        characters.  If VAL is a list of rules, use the first rule whose
        FROM vector matches the characters at BUF, and return Qnil if no
        rule matches.  While comparing, if the characters run out before
        FROM does, return Qt unless LAST_BLOCK is nonzero, in which case
        that rule simply fails to match.  On a match *FROM_NCHARS is set
        to the length of FROM.  Finally the first (or only) TO character
        is stored in *BUF, *TO_NCHARS is set to the length of TO when TO
        is a vector, and TO is returned.  *FROM_NCHARS and *TO_NCHARS
        are left untouched in the cases not mentioned.  */
5688 static Lisp_Object
5689 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5690      Lisp_Object val;
5691      int *buf, *buf_end;
5692      int last_block;
5693      int *from_nchars, *to_nchars;
5694 {
5695   /* VAL is TO or (([FROM-CHAR ...] .  TO) ...) where TO is TO-CHAR or
5696      [TO-CHAR ...].  */
5697   if (CONSP (val))
5698     {
5699       Lisp_Object from, tail;
5700       int i, len;
5701
5702       for (tail = val; CONSP (tail); tail = XCDR (tail))
5703         {
5704           val = XCAR (tail);
5705           from = XCAR (val);
5706           len = ASIZE (from);
5707           for (i = 0; i < len; i++)
5708             {
5709               if (buf + i == buf_end)
5710                 {
                       /* Ran out of characters in the middle of FROM.  */
5711                   if (! last_block)
5712                     return Qt;
5713                   break;
5714                 }
5715               if (XINT (AREF (from, i)) != buf[i])
5716                 break;
5717             }
5718           if (i == len)
5719             {
                   /* The whole of FROM matched; VAL becomes TO.  */
5720               val = XCDR (val);
5721               *from_nchars = len;
5722               break;
5723             }
5724         }
5725       if (! CONSP (tail))
5726         return Qnil;
5727     }
5728   if (VECTORP (val))
5729     *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5730   else
5731     *buf = XINT (val);
5732   return val;
5733 }
5734
5735
     /* Write decoded text to CODING->destination, starting
        CODING->produced bytes into it, and return the number of
        trailing elements of CODING->charbuf that were left unprocessed
        (always 0 when CODING->chars_at_source is nonzero).
        If CODING->chars_at_source is zero, the characters are the
        non-negative elements of CODING->charbuf.  Each one is looked up
        in TRANSLATION_TABLE and written in multibyte or single-byte
        form; processing stops early when get_translation returns Qt,
        which it does only when LAST_BLOCK is zero.  Negative elements
        are annotation data and are skipped.  Processed elements are
        overwritten with the number of characters produced for them.
        Otherwise the bytes at CODING->source are copied, converting
        line ends for the Qdos and Qmac eol types, and all of the source
        is marked consumed.
        In both cases insert_from_gap is called if CODING->dst_object is
        a buffer, and CODING->produced and CODING->produced_char are
        advanced.  */
5736 static int
5737 produce_chars (coding, translation_table, last_block)
5738      struct coding_system *coding;
5739      Lisp_Object translation_table;
5740      int last_block;
5741 {
5742   unsigned char *dst = coding->destination + coding->produced;
5743   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5744   int produced;
5745   int produced_chars = 0;
5746   int carryover = 0;
5747
5748   if (! coding->chars_at_source)
5749     {
5750       /* Characters are in coding->charbuf.  */
5751       int *buf = coding->charbuf;
5752       int *buf_end = buf + coding->charbuf_used;
5753
           /* Source and destination are the same buffer: limit the
              destination to the source bytes consumed so far.  */
5754       if (BUFFERP (coding->src_object)
5755           && EQ (coding->src_object, coding->dst_object))
5756         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5757
5758       while (buf < buf_end)
5759         {
5760           int c = *buf, i;
5761
5762           if (c >= 0)
5763             {
5764               int from_nchars = 1, to_nchars = 1;
5765               Lisp_Object trans = Qnil;
5766
5767               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5768               if (! NILP (trans))
5769                 {
5770                   trans = get_translation (trans, buf, buf_end, last_block,
5771                                            &from_nchars, &to_nchars);
                       /* Qt: more characters are needed to decide; leave
                          the rest of charbuf as carryover.  */
5772                   if (EQ (trans, Qt))
5773                     break;
5774                   c = *buf;
5775                 }
5776
5777               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5778                 {
5779                   dst = alloc_destination (coding,
5780                                            buf_end - buf
5781                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
5782                                            dst);
5783                   dst_end = coding->destination + coding->dst_bytes;
5784                 }
5785
5786               for (i = 0; i < to_nchars; i++)
5787                 {
                       /* The first character is already in C; the others
                          come from the TO vector.  */
5788                   if (i > 0)
5789                     c = XINT (AREF (trans, i));
5790                   if (coding->dst_multibyte
5791                       || ! CHAR_BYTE8_P (c))
5792                     CHAR_STRING_ADVANCE (c, dst);
5793                   else
5794                     *dst++ = CHAR_TO_BYTE8 (c);
5795                 }
5796               produced_chars += to_nchars;
                   /* Replace the FROM_NCHARS elements just handled by the
                      count of characters produced followed by zeros;
                      produce_annotation adds up these counts.  */
5797               *buf++ = to_nchars;
5798               while (--from_nchars > 0)
5799                 *buf++ = 0;
5800             }
5801           else
5802             /* This is an annotation datum.  (-C) is the length.  */
5803             buf += -c;
5804         }
5805       carryover = buf_end - buf;
5806     }
5807   else
5808     {
           /* The source bytes themselves are the text to produce.  */
5809       const unsigned char *src = coding->source;
5810       const unsigned char *src_end = src + coding->src_bytes;
5811       Lisp_Object eol_type;
5812
5813       eol_type = CODING_ID_EOL_TYPE (coding->id);
5814
5815       if (coding->src_multibyte != coding->dst_multibyte)
5816         {
5817           if (coding->src_multibyte)
5818             {
                   /* NOTE(review): MULTIBYTEP and CONSUMED_CHARS are not
                      used by name below; presumably ONE_MORE_BYTE refers
                      to them and jumps to `no_more_source' at the end of
                      the source.  CONSUMED_CHARS has no initial value --
                      confirm against the macro definition.  */
5819               int multibytep = 1;
5820               int consumed_chars;
5821
5822               while (1)
5823                 {
5824                   const unsigned char *src_base = src;
5825                   int c;
5826
5827                   ONE_MORE_BYTE (c);
5828                   if (c == '\r')
5829                     {
5830                       if (EQ (eol_type, Qdos))
5831                         {
5832                           if (src == src_end)
5833                             {
5834                               record_conversion_result
5835                                 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5836                               goto no_more_source;
5837                             }
5838                           if (*src == '\n')
5839                             c = *src++;
5840                         }
5841                       else if (EQ (eol_type, Qmac))
5842                         c = '\n';
5843                     }
5844                   if (dst == dst_end)
5845                     {
5846                       coding->consumed = src - coding->source;
5847
5848                     if (EQ (coding->src_object, coding->dst_object))
5849                       dst_end = (unsigned char *) src;
5850                     if (dst == dst_end)
5851                       {
5852                         dst = alloc_destination (coding, src_end - src + 1,
5853                                                  dst);
5854                         dst_end = coding->destination + coding->dst_bytes;
                             /* Recompute the source pointers from
                                CODING after coding_set_source.  */
5855                         coding_set_source (coding);
5856                         src = coding->source + coding->consumed;
5857                         src_end = coding->source + coding->src_bytes;
5858                       }
5859                     }
5860                   *dst++ = c;
5861                   produced_chars++;
5862                 }
5863             no_more_source:
5864               ;
5865             }
5866           else
5867             while (src < src_end)
5868               {
5869                 int multibytep = 1;
5870                 int c = *src++;
5871
5872                 if (c == '\r')
5873                   {
5874                     if (EQ (eol_type, Qdos))
5875                       {
5876                         if (src < src_end
5877                             && *src == '\n')
5878                           c = *src++;
5879                       }
5880                     else if (EQ (eol_type, Qmac))
5881                       c = '\n';
5882                   }
                     /* Make sure at least two bytes of room remain.  */
5883                 if (dst >= dst_end - 1)
5884                   {
5885                     coding->consumed = src - coding->source;
5886
5887                     if (EQ (coding->src_object, coding->dst_object))
5888                       dst_end = (unsigned char *) src;
5889                     if (dst >= dst_end - 1)
5890                       {
5891                         dst = alloc_destination (coding, src_end - src + 2,
5892                                                  dst);
5893                         dst_end = coding->destination + coding->dst_bytes;
5894                         coding_set_source (coding);
5895                         src = coding->source + coding->consumed;
5896                         src_end = coding->source + coding->src_bytes;
5897                       }
5898                   }
5899                 EMIT_ONE_BYTE (c);
5900               }
5901         }
5902       else
5903         {
               /* Source and destination have the same multibyteness.  */
5904           if (!EQ (coding->src_object, coding->dst_object))
5905             {
5906               int require = coding->src_bytes - coding->dst_bytes;
5907
5908               if (require > 0)
5909                 {
5910                   EMACS_INT offset = src - coding->source;
5911
5912                   dst = alloc_destination (coding, require, dst);
5913                   coding_set_source (coding);
5914                   src = coding->source + offset;
5915                   src_end = coding->source + coding->src_bytes;
5916                 }
5917             }
               /* Start from the source character count and subtract one
                  for each CR handled under the Qdos eol type.  */
5918           produced_chars = coding->src_chars;
5919           while (src < src_end)
5920             {
5921               int c = *src++;
5922
5923               if (c == '\r')
5924                 {
5925                   if (EQ (eol_type, Qdos))
5926                     {
5927                       if (src < src_end
5928                           && *src == '\n')
5929                         c = *src++;
                           /* NOTE(review): the count is decremented even
                              when the CR is not followed by LF, although
                              the CR is still written below -- confirm
                              that this is intended.  */
5930                       produced_chars--;
5931                     }
5932                   else if (EQ (eol_type, Qmac))
5933                     c = '\n';
5934                 }
5935               *dst++ = c;
5936             }
5937         }
5938       coding->consumed = coding->src_bytes;
5939       coding->consumed_char = coding->src_chars;
5940     }
5941
       /* PRODUCED is the number of bytes written by this call.  */
5942   produced = dst - (coding->destination + coding->produced);
5943   if (BUFFERP (coding->dst_object))
5944     insert_from_gap (produced_chars, produced);
5945   coding->produced += produced;
5946   coding->produced_char += produced_chars;
5947   return carryover;
5948 }
5949
5950 /* Compose text in CODING->dst_object according to the annotation
5951    data at CHARBUF.  CHARBUF is an array:
5952      [ -LENGTH ANNOTATION_MASK NCHARS METHOD [ COMPONENTS... ] ]
        LENGTH counts every element of the datum, the four leading ones
        included.  The text composed is the NCHARS characters that start
        at POS.  Nothing is done if NCHARS is not positive, if METHOD is
        not a known composition method, if the number of COMPONENTS does
        not fit the local argument array, or if a component is negative.
5953  */
5954
5955 static INLINE void
5956 produce_composition (coding, charbuf, pos)
5957      struct coding_system *coding;
5958      int *charbuf;
5959      EMACS_INT pos;
5960 {
5961   int len;
5962   EMACS_INT to;
5963   enum composition_method method;
5964   Lisp_Object components;
5965
5966   len = -charbuf[0];
5967   to = pos + charbuf[2];
5968   if (to <= pos)
5969     return;
5970   method = (enum composition_method) (charbuf[3]);
5971
5972   if (method == COMPOSITION_RELATIVE)
5973     components = Qnil;
5974   else if (method >= COMPOSITION_WITH_RULE
5975            && method <= COMPOSITION_WITH_RULE_ALTCHARS)
5976     {
5977       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5978       int i;
5979
           /* Skip the four leading elements; what is left are the
              components.  */
5980       len -= 4;
5981       charbuf += 4;
           /* Never write past the end of ARGS.  */
           if (len < 0 || len > MAX_COMPOSITION_COMPONENTS * 2 - 1)
             return;
5982       for (i = 0; i < len; i++)
5983         {
5984           args[i] = make_number (charbuf[i]);
               /* Test the raw value: the sign of a Lisp_Object says
                  nothing reliable about the integer it holds.  */
5985           if (charbuf[i] < 0)
5986             return;
5987         }
5988       components = (method == COMPOSITION_WITH_ALTCHARS
5989                     ? Fstring (len, args) : Fvector (len, args));
5990     }
5991   else
5992     return;
5993   compose_text (pos, to, components, Qnil, coding->dst_object);
5994 }
5995
5996
5997 /* Put a `charset' text property on decoded text in
5998    CODING->dst_object as directed by the annotation datum at CHARBUF:
5999      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
6000    The property spans the NCHARS characters that end at POS.  */
6001
6002 static INLINE void
6003 produce_charset (coding, charbuf, pos)
6004      struct coding_system *coding;
6005      int *charbuf;
6006      EMACS_INT pos;
6007 {
6008   EMACS_INT nchars = charbuf[2];
6009   struct charset *cs = CHARSET_FROM_ID (charbuf[3]);
6010   Lisp_Object start = make_number (pos - nchars);
6011
6012   Fput_text_property (start, make_number (pos), Qcharset,
6013                       CHARSET_NAME (cs), coding->dst_object);
6014 }
6015
6016
     /* Number of `int' elements first requested for the conversion work
        buffer.  */
6017 #define CHARBUF_SIZE 0x4000
6018
     /* Give CODING a work buffer: set CODING->charbuf to space obtained
        from alloca and CODING->charbuf_size to its size in elements.
        The request starts at CHARBUF_SIZE elements and is halved while
        alloca yields a null pointer and the size is above 1024.  If no
        space was obtained, record CODING_RESULT_INSUFFICIENT_MEM and
        RETURN CODING->result from the function that uses this macro.
        Because of that hidden return, and because the space comes from
        alloca, use this only directly in the body of a function that
        returns the conversion result.
        NOTE(review): alloca does not normally report failure by
        returning a null pointer, so the retry and failure paths are
        presumably never taken -- confirm for the supported platforms.  */
6019 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
6020   do {                                                                  \
6021     int size = CHARBUF_SIZE;                                            \
6022                                                                         \
6023     (coding)->charbuf = NULL;                                           \
6024     while (size > 1024)                                                 \
6025       {                                                                 \
6026         (coding)->charbuf = (int *) alloca (sizeof (int) * size);       \
6027         if ((coding)->charbuf)                                          \
6028           break;                                                        \
6029         size >>= 1;                                                     \
6030       }                                                                 \
6031     if (! (coding)->charbuf)                                            \
6032       {                                                                 \
6033         record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
6034         return (coding)->result;                                        \
6035       }                                                                 \
6036     (coding)->charbuf_size = size;                                      \
6037   } while (0)
6038
6039
6040 /* Apply the annotation data recorded in CODING->charbuf to the text
6041    in CODING->dst_object, starting from character position POS.  */
6042 static void
6043 produce_annotation (coding, pos)
6044      struct coding_system *coding;
6045      EMACS_INT pos;
6046 {
6047   int *p, *limit;
6048
6049   if (NILP (coding->dst_object))
6050     return;
6051
6052   p = coding->charbuf;
6053   limit = p + coding->charbuf_used;
6054   while (p < limit)
6055     {
6056       int datum = *p;
6057
6058       if (datum >= 0)
6059         {
6060           pos += datum;         /* Advance POS by this element.  */
6061           p++;
6062           continue;
6063         }
6064       if (p[1] == CODING_ANNOTATE_COMPOSITION_MASK)
6065         produce_composition (coding, p, pos);
6066       else if (p[1] == CODING_ANNOTATE_CHARSET_MASK)
6067         produce_charset (coding, p, pos);
6068       else
6069         abort ();
6070       p += -datum;              /* (-DATUM) is the datum length.  */
6071     }
6072 }
6073
6074 /* Decode the data at CODING->src_object into CODING->dst_object.
6075    CODING->src_object is a buffer, a string, or nil.
6076    CODING->dst_object is a buffer.
6077
6078    If CODING->src_object is a buffer, it must be the current buffer.
6079    In this case, if CODING->src_pos is positive, it is a position of
6080    the source text in the buffer, otherwise, the source text is in the
6081    gap area of the buffer, and CODING->src_pos specifies the offset of
6082    the text from GPT (which must be the same as PT).  If this is the
6083    same buffer as CODING->dst_object, CODING->src_pos must be
6084    negative.
6085
6086    If CODING->src_object is a string, CODING->src_pos is an index to
6087    that string.
6088
6089    If CODING->src_object is nil, CODING->source must already point to
6090    the non-relocatable memory area.  In this case, CODING->src_pos is
6091    an offset from CODING->source.
6092
6093    The decoded data is inserted at the current point of the buffer
6094    CODING->dst_object.
        Return CODING->result.  Source bytes that the decoder leaves
        unconsumed are handed to produce_chars as binary data if
        CODING_MODE_LAST_BLOCK is set in CODING->mode, and are otherwise
        saved in CODING->carryover with their count in
        CODING->carryover_bytes.
6095 */
6096
6097 static int
6098 decode_coding (coding)
6099      struct coding_system *coding;
6100 {
6101   Lisp_Object attrs;
6102   Lisp_Object undo_list;
6103   Lisp_Object translation_table;
6104   int carryover;
6105   int i;
6106
       /* If the gap lies inside the source text, move it to the start
          of that text.  */
6107   if (BUFFERP (coding->src_object)
6108       && coding->src_pos > 0
6109       && coding->src_pos < GPT
6110       && coding->src_pos + coding->src_chars > GPT)
6111     move_gap_both (coding->src_pos, coding->src_pos_byte);
6112
6113   undo_list = Qt;
6114   if (BUFFERP (coding->dst_object))
6115     {
6116       if (current_buffer != XBUFFER (coding->dst_object))
6117         set_buffer_internal (XBUFFER (coding->dst_object));
6118       if (GPT != PT)
6119         move_gap_both (PT, PT_BYTE);
           /* Set the undo list to Qt while decoding; the saved value is
              put back, and a single insertion recorded, at the end.  */
6120       undo_list = current_buffer->undo_list;
6121       current_buffer->undo_list = Qt;
6122     }
6123
6124   coding->consumed = coding->consumed_char = 0;
6125   coding->produced = coding->produced_char = 0;
6126   coding->chars_at_source = 0;
6127   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6128   coding->errors = 0;
6129
       /* NOTE(review): this macro returns from the function when it
          gets no memory, in which case the undo list saved above is
          not restored -- confirm whether that path can be taken.  */
6130   ALLOC_CONVERSION_WORK_AREA (coding);
6131
6132   attrs = CODING_ID_ATTRS (coding->id);
6133   translation_table = get_translation_table (attrs, 0, NULL);
6134
       /* Decode one charbuf-full at a time until the source is used up
          or a result other than success is recorded.  */
6135   carryover = 0;
6136   do
6137     {
6138       EMACS_INT pos = coding->dst_pos + coding->produced_char;
6139
6140       coding_set_source (coding);
6141       coding->annotated = 0;
6142       coding->charbuf_used = carryover;
6143       (*(coding->decoder)) (coding);
6144       coding_set_destination (coding);
6145       carryover = produce_chars (coding, translation_table, 0);
6146       if (coding->annotated)
6147         produce_annotation (coding, pos);
           /* Move the elements produce_chars left unprocessed to the
              front of charbuf for the next round.  */
6148       for (i = 0; i < carryover; i++)
6149         coding->charbuf[i]
6150           = coding->charbuf[coding->charbuf_used - carryover + i];
6151     }
6152   while (coding->consumed < coding->src_bytes
6153          && ! coding->result);
6154
       /* Process what is still carried over, this time with LAST_BLOCK
          nonzero so that produce_chars does not wait for more.  */
6155   if (carryover > 0)
6156     {
6157       coding_set_destination (coding);
6158       coding->charbuf_used = carryover;
6159       produce_chars (coding, translation_table, 1);
6160     }
6161
6162   coding->carryover_bytes = 0;
6163   if (coding->consumed < coding->src_bytes)
6164     {
6165       int nbytes = coding->src_bytes - coding->consumed;
6166       const unsigned char *src;
6167
6168       coding_set_source (coding);
6169       coding_set_destination (coding);
6170       src = coding->source + coding->consumed;
6171
6172       if (coding->mode & CODING_MODE_LAST_BLOCK)
6173         {
6174           /* Flush out unprocessed data as binary chars.  We are sure
6175              that the number of data is less than the size of
6176              coding->charbuf.  */
               /* NOTE(review): bytes with the 0x80 bit are stored
                  negated, but produce_chars treats a negative element
                  as the length of an annotation datum and skips that
                  many elements -- confirm that such bytes really reach
                  the output.  */
6177           coding->charbuf_used = 0;
6178           while (nbytes-- > 0)
6179             {
6180               int c = *src++;
6181
6182               coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
6183             }
6184           produce_chars (coding, Qnil, 1);
6185         }
6186       else
6187         {
6188           /* Record unprocessed bytes in coding->carryover.  We are
6189              sure that the number of data is less than the size of
6190              coding->carryover.  */
6191           unsigned char *p = coding->carryover;
6192
6193           coding->carryover_bytes = nbytes;
6194           while (nbytes-- > 0)
6195             *p++ = *src++;
6196         }
6197       coding->consumed = coding->src_bytes;
6198     }
6199
6200   if (BUFFERP (coding->dst_object))
6201     {
6202       current_buffer->undo_list = undo_list;
6203       record_insert (coding->dst_pos, coding->produced_char);
6204     }
6205   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6206     decode_eol (coding);
6207   return coding->result;
6208 }
6209
6210
6211 /* Extract an annotation datum from a composition starting at POS and
6212    ending before LIMIT of CODING->src_object (buffer or string), store
6213    the data in BUF, set *STOP to a starting position of the next
6214    composition (if any) or to LIMIT, and return the address of the
6215    next element of BUF.
6216
6217    If such an annotation is not found, set *STOP to a starting
6218    position of a composition after POS (if any) or to LIMIT, and
6219    return BUF.  */
6220
6221 static INLINE int *
6222 handle_composition_annotation (pos, limit, coding, buf, stop)
6223      EMACS_INT pos, limit;
6224      struct coding_system *coding;
6225      int *buf;
6226      EMACS_INT *stop;
6227 {
6228   EMACS_INT start, end;
6229   Lisp_Object prop;
6230
       /* No composition was found, or the one found extends beyond
          LIMIT: nothing to annotate before LIMIT.  */
6231   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6232       || end > limit)
6233     *stop = limit;
6234   else if (start > pos)
6235     *stop = start;
6236   else
6237     {
           /* The composition found starts at or before POS.  Data is
              stored only when it starts exactly at POS.  */
6238       if (start == pos)
6239         {
6240           /* We found a composition.  Store the corresponding
6241              annotation data in BUF.  */
               /* HEAD remembers where the datum begins so that its
                  first element can be adjusted below; presumably
                  ADD_COMPOSITION_DATA stores the negated length there
                  and advances BUF -- confirm against the macro.  */
6242           int *head = buf;
6243           enum composition_method method = COMPOSITION_METHOD (prop);
6244           int nchars = COMPOSITION_LENGTH (prop);
6245
6246           ADD_COMPOSITION_DATA (buf, nchars, method);
6247           if (method != COMPOSITION_RELATIVE)
6248             {
                   /* Append the components, whichever form they have,
                      counting them in LEN.  */
6249               Lisp_Object components;
6250               int len, i, i_byte;
6251
6252               components = COMPOSITION_COMPONENTS (prop);
6253               if (VECTORP (components))
6254                 {
6255                   len = XVECTOR (components)->size;
6256                   for (i = 0; i < len; i++)
6257                     *buf++ = XINT (AREF (components, i));
6258                 }
6259               else if (STRINGP (components))
6260                 {
6261                   len = SCHARS (components);
6262                   i = i_byte = 0;
6263                   while (i < len)
6264                     {
6265                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6266                       buf++;
6267                     }
6268                 }
6269               else if (INTEGERP (components))
6270                 {
6271                   len = 1;
6272                   *buf++ = XINT (components);
6273                 }
6274               else if (CONSP (components))
6275                 {
6276                   for (len = 0; CONSP (components);
6277                        len++, components = XCDR (components))
6278                     *buf++ = XINT (XCAR (components));
6279                 }
6280               else
6281                 abort ();
                   /* Account for the LEN appended elements in the first
                      element of the datum.  */
6282               *head -= len;
6283             }
6284         }
6285
           /* Look for the next composition, starting from the end of
              the one just examined.  */
6286       if (find_composition (end, limit, &start, &end, &prop,
6287                             coding->src_object)
6288           && end <= limit)
6289         *stop = start;
6290       else
6291         *stop = limit;
6292     }
6293   return buf;
6294 }
6295
6296
6297 /* Extract an annotation datum from a text property `charset' at POS of
6298    CODING->src_object (buffer of string), store the data in BUF, set
6299    *STOP to the position where the value of `charset' property changes
6300    (limiting by LIMIT), and return the address of the next element of
6301    BUF.
6302
6303    If the property value is nil, set *STOP to the position where the
6304    property value is non-nil (limiting by LIMIT), and return BUF.  */
6305
6306 static INLINE int *
6307 handle_charset_annotation (pos, limit, coding, buf, stop)
6308      EMACS_INT pos, limit;
6309      struct coding_system *coding;
6310      int *buf;
6311      EMACS_INT *stop;
6312 {
6313   Lisp_Object val, next;
6314   int id;
6315
6316   val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
6317   if (! NILP (val) && CHARSETP (val))
6318     id = XINT (CHARSET_SYMBOL_ID (val));
6319   else
6320     id = -1;
6321   ADD_CHARSET_DATA (buf, 0, id);
6322   next = Fnext_single_property_change (make_number (pos), Qcharset,
6323                                        coding->src_object,
6324                                        make_number (limit));
6325   *stop = XINT (next);
6326   return buf;
6327 }
6328
6329
6330 static void
6331 consume_chars (coding, translation_table, max_lookup)
6332      struct coding_system *coding;
6333      Lisp_Object translation_table;
6334      int max_lookup;
6335 {
6336   int *buf = coding->charbuf;
6337   int *buf_end = coding->charbuf + coding->charbuf_size;
6338   const unsigned char *src = coding->source + coding->consumed;
6339   const unsigned char *src_end = coding->source + coding->src_bytes;
6340   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6341   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6342   int multibytep = coding->src_multibyte;
6343   Lisp_Object eol_type;
6344   int c;
6345   EMACS_INT stop, stop_composition, stop_charset;
6346   int *lookup_buf = NULL;
6347
6348   if (! NILP (translation_table))
6349     lookup_buf = alloca (sizeof (int) * max_lookup);
6350
6351   eol_type = CODING_ID_EOL_TYPE (coding->id);
6352   if (VECTORP (eol_type))
6353     eol_type = Qunix;
6354
6355   /* Note: composition handling is not yet implemented.  */
6356   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6357
6358   if (NILP (coding->src_object))
6359     stop = stop_composition = stop_charset = end_pos;
6360   else
6361     {
6362       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6363         stop = stop_composition = pos;
6364       else
6365         stop = stop_composition = end_pos;
6366       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6367         stop = stop_charset = pos;
6368       else
6369         stop_charset = end_pos;
6370     }
6371
6372   /* Compensate for CRLF and conversion.  */
6373   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6374   while (buf < buf_end)
6375     {
6376       Lisp_Object trans;
6377
6378       if (pos == stop)
6379         {
6380           if (pos == end_pos)
6381             break;
6382           if (pos == stop_composition)
6383             buf = handle_composition_annotation (pos, end_pos, coding,
6384                                                  buf, &stop_composition);
6385           if (pos == stop_charset)
6386             buf = handle_charset_annotation (pos, end_pos, coding,
6387                                              buf, &stop_charset);
6388           stop = (stop_composition < stop_charset
6389                   ? stop_composition : stop_charset);
6390         }
6391
6392       if (! multibytep)
6393         {
6394           EMACS_INT bytes;
6395
6396           if (coding->encoder == encode_coding_raw_text)
6397             c = *src++, pos++;
6398           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6399             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6400           else
6401             c = BYTE8_TO_CHAR (*src), src++, pos++;
6402         }
6403       else
6404         c = STRING_CHAR_ADVANCE (src), pos++;
6405       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6406         c = '\n';
6407       if (! EQ (eol_type, Qunix))
6408         {
6409           if (c == '\n')
6410             {
6411               if (EQ (eol_type, Qdos))
6412                 *buf++ = '\r';
6413               else
6414                 c = '\r';
6415             }
6416         }
6417
6418       trans = Qnil;
6419       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6420       if (NILP (trans))
6421         *buf++ = c;
6422       else
6423         {
6424           int from_nchars = 1, to_nchars = 1;
6425           int *lookup_buf_end;
6426           const unsigned char *p = src;
6427           int i;
6428
6429           lookup_buf[0] = c;
6430           for (i = 1; i < max_lookup && p < src_end; i++)
6431             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6432           lookup_buf_end = lookup_buf + i;
6433           trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6434                                    &from_nchars, &to_nchars);
6435           if (EQ (trans, Qt)
6436               || buf + to_nchars > buf_end)
6437             break;
6438           *buf++ = *lookup_buf;
6439           for (i = 1; i < to_nchars; i++)
6440             *buf++ = XINT (AREF (trans, i));
6441           for (i = 1; i < from_nchars; i++, pos++)
6442             src += MULTIBYTE_LENGTH_NO_CHECK (src);
6443         }
6444     }
6445
6446   coding->consumed = src - coding->source;
6447   coding->consumed_char = pos - coding->src_pos;
6448   coding->charbuf_used = buf - coding->charbuf;
6449   coding->chars_at_source = 0;
6450 }
6451
6452
6453 /* Encode the text at CODING->src_object into CODING->dst_object.
6454    CODING->src_object is a buffer or a string.
6455    CODING->dst_object is a buffer or nil.
6456
6457    If CODING->src_object is a buffer, it must be the current buffer.
6458    In this case, if CODING->src_pos is positive, it is a position of
6459    the source text in the buffer, otherwise. the source text is in the
6460    gap area of the buffer, and coding->src_pos specifies the offset of
6461    the text from GPT (which must be the same as PT).  If this is the
6462    same buffer as CODING->dst_object, CODING->src_pos must be
6463    negative and CODING should not have `pre-write-conversion'.
6464
6465    If CODING->src_object is a string, CODING should not have
6466    `pre-write-conversion'.
6467
6468    If CODING->dst_object is a buffer, the encoded data is inserted at
6469    the current point of that buffer.
6470
6471    If CODING->dst_object is nil, the encoded data is placed at the
6472    memory area specified by CODING->destination.  */
6473
6474 static int
6475 encode_coding (coding)
6476      struct coding_system *coding;
6477 {
6478   Lisp_Object attrs;
6479   Lisp_Object translation_table;
6480   int max_lookup;
6481
6482   attrs = CODING_ID_ATTRS (coding->id);
6483   if (coding->encoder == encode_coding_raw_text)
6484     translation_table = Qnil, max_lookup = 0;
6485   else
6486     translation_table = get_translation_table (attrs, 1, &max_lookup);
6487
6488   if (BUFFERP (coding->dst_object))
6489     {
6490       set_buffer_internal (XBUFFER (coding->dst_object));
6491       coding->dst_multibyte
6492         = ! NILP (current_buffer->enable_multibyte_characters);
6493     }
6494
6495   coding->consumed = coding->consumed_char = 0;
6496   coding->produced = coding->produced_char = 0;
6497   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6498   coding->errors = 0;
6499
6500   ALLOC_CONVERSION_WORK_AREA (coding);
6501
6502   do {
6503     coding_set_source (coding);
6504     consume_chars (coding, translation_table, max_lookup);
6505     coding_set_destination (coding);
6506     (*(coding->encoder)) (coding);
6507   } while (coding->consumed_char < coding->src_chars);
6508
6509   if (BUFFERP (coding->dst_object))
6510     insert_from_gap (coding->produced_char, coding->produced);
6511
6512   return (coding->result);
6513 }
6514
6515
6516 /* Name (or base name) of work buffer for code conversion.  */
6517 static Lisp_Object Vcode_conversion_workbuf_name;
6518
6519 /* A working buffer used by the top level conversion.  Once it is
6520    created, it is never destroyed.  It has the name
6521    Vcode_conversion_workbuf_name.  The other working buffers are
6522    destroyed after the use is finished, and their names are modified
6523    versions of Vcode_conversion_workbuf_name.  */
6524 static Lisp_Object Vcode_conversion_reused_workbuf;
6525
6526 /* 1 iff Vcode_conversion_reused_workbuf is already in use.  */
6527 static int reused_workbuf_in_use;
6528
6529
6530 /* Return a working buffer of code convesion.  MULTIBYTE specifies the
6531    multibyteness of returning buffer.  */
6532
6533 static Lisp_Object
6534 make_conversion_work_buffer (multibyte)
6535      int multibyte;
6536 {
6537   Lisp_Object name, workbuf;
6538   struct buffer *current;
6539
6540   if (reused_workbuf_in_use++)
6541     {
6542       name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name, Qnil);
6543       workbuf = Fget_buffer_create (name);
6544     }
6545   else
6546     {
6547       name = Vcode_conversion_workbuf_name;
6548       workbuf = Fget_buffer_create (name);
6549       if (NILP (Vcode_conversion_reused_workbuf))
6550         Vcode_conversion_reused_workbuf = workbuf;
6551     }
6552   current = current_buffer;
6553   set_buffer_internal (XBUFFER (workbuf));
6554   Ferase_buffer ();
6555   current_buffer->undo_list = Qt;
6556   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6557   set_buffer_internal (current);
6558   return workbuf;
6559 }
6560
6561
6562 static Lisp_Object
6563 code_conversion_restore (arg)
6564      Lisp_Object arg;
6565 {
6566   Lisp_Object current, workbuf;
6567
6568   current = XCAR (arg);
6569   workbuf = XCDR (arg);
6570   if (! NILP (workbuf))
6571     {
6572       if (EQ (workbuf, Vcode_conversion_reused_workbuf))
6573         reused_workbuf_in_use = 0;
6574       else if (! NILP (Fbuffer_live_p (workbuf)))
6575         Fkill_buffer (workbuf);
6576     }
6577   set_buffer_internal (XBUFFER (current));
6578   return Qnil;
6579 }
6580
6581 Lisp_Object
6582 code_conversion_save (with_work_buf, multibyte)
6583      int with_work_buf, multibyte;
6584 {
6585   Lisp_Object workbuf = Qnil;
6586
6587   if (with_work_buf)
6588     workbuf = make_conversion_work_buffer (multibyte);
6589   record_unwind_protect (code_conversion_restore,
6590                          Fcons (Fcurrent_buffer (), workbuf));
6591   return workbuf;
6592 }
6593
6594 int
6595 decode_coding_gap (coding, chars, bytes)
6596      struct coding_system *coding;
6597      EMACS_INT chars, bytes;
6598 {
6599   int count = specpdl_ptr - specpdl;
6600   Lisp_Object attrs;
6601
6602   code_conversion_save (0, 0);
6603
6604   coding->src_object = Fcurrent_buffer ();
6605   coding->src_chars = chars;
6606   coding->src_bytes = bytes;
6607   coding->src_pos = -chars;
6608   coding->src_pos_byte = -bytes;
6609   coding->src_multibyte = chars < bytes;
6610   coding->dst_object = coding->src_object;
6611   coding->dst_pos = PT;
6612   coding->dst_pos_byte = PT_BYTE;
6613   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6614   coding->mode |= CODING_MODE_LAST_BLOCK;
6615
6616   if (CODING_REQUIRE_DETECTION (coding))
6617     detect_coding (coding);
6618
6619   decode_coding (coding);
6620
6621   attrs = CODING_ID_ATTRS (coding->id);
6622   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6623     {
6624       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6625       Lisp_Object val;
6626
6627       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6628       val = call1 (CODING_ATTR_POST_READ (attrs),
6629                    make_number (coding->produced_char));
6630       CHECK_NATNUM (val);
6631       coding->produced_char += Z - prev_Z;
6632       coding->produced += Z_BYTE - prev_Z_BYTE;
6633     }
6634
6635   unbind_to (count, Qnil);
6636   return coding->result;
6637 }
6638
6639 int
6640 encode_coding_gap (coding, chars, bytes)
6641      struct coding_system *coding;
6642      EMACS_INT chars, bytes;
6643 {
6644   int count = specpdl_ptr - specpdl;
6645
6646   code_conversion_save (0, 0);
6647
6648   coding->src_object = Fcurrent_buffer ();
6649   coding->src_chars = chars;
6650   coding->src_bytes = bytes;
6651   coding->src_pos = -chars;
6652   coding->src_pos_byte = -bytes;
6653   coding->src_multibyte = chars < bytes;
6654   coding->dst_object = coding->src_object;
6655   coding->dst_pos = PT;
6656   coding->dst_pos_byte = PT_BYTE;
6657
6658   encode_coding (coding);
6659
6660   unbind_to (count, Qnil);
6661   return coding->result;
6662 }
6663
6664
6665 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6666    SRC_OBJECT into DST_OBJECT by coding context CODING.
6667
6668    SRC_OBJECT is a buffer, a string, or Qnil.
6669
6670    If it is a buffer, the text is at point of the buffer.  FROM and TO
6671    are positions in the buffer.
6672
6673    If it is a string, the text is at the beginning of the string.
6674    FROM and TO are indices to the string.
6675
6676    If it is nil, the text is at coding->source.  FROM and TO are
6677    indices to coding->source.
6678
6679    DST_OBJECT is a buffer, Qt, or Qnil.
6680
6681    If it is a buffer, the decoded text is inserted at point of the
6682    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6683    is deleted.
6684
6685    If it is Qt, a string is made from the decoded text, and
6686    set in CODING->dst_object.
6687
6688    If it is Qnil, the decoded text is stored at CODING->destination.
6689    The caller must allocate CODING->dst_bytes bytes at
6690    CODING->destination by xmalloc.  If the decoded text is longer than
6691    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6692  */
6693
6694 void
6695 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6696                       dst_object)
6697      struct coding_system *coding;
6698      Lisp_Object src_object;
6699      EMACS_INT from, from_byte, to, to_byte;
6700      Lisp_Object dst_object;
6701 {
6702   int count = specpdl_ptr - specpdl;
6703   unsigned char *destination;
6704   EMACS_INT dst_bytes;
6705   EMACS_INT chars = to - from;
6706   EMACS_INT bytes = to_byte - from_byte;
6707   Lisp_Object attrs;
6708   Lisp_Object buffer;
6709   int saved_pt = -1, saved_pt_byte;
6710
6711   buffer = Fcurrent_buffer ();
6712
6713   if (NILP (dst_object))
6714     {
6715       destination = coding->destination;
6716       dst_bytes = coding->dst_bytes;
6717     }
6718
6719   coding->src_object = src_object;
6720   coding->src_chars = chars;
6721   coding->src_bytes = bytes;
6722   coding->src_multibyte = chars < bytes;
6723
6724   if (STRINGP (src_object))
6725     {
6726       coding->src_pos = from;
6727       coding->src_pos_byte = from_byte;
6728     }
6729   else if (BUFFERP (src_object))
6730     {
6731       set_buffer_internal (XBUFFER (src_object));
6732       if (from != GPT)
6733         move_gap_both (from, from_byte);
6734       if (EQ (src_object, dst_object))
6735         {
6736           saved_pt = PT, saved_pt_byte = PT_BYTE;
6737           TEMP_SET_PT_BOTH (from, from_byte);
6738           del_range_both (from, from_byte, to, to_byte, 1);
6739           coding->src_pos = -chars;
6740           coding->src_pos_byte = -bytes;
6741         }
6742       else
6743         {
6744           coding->src_pos = from;
6745           coding->src_pos_byte = from_byte;
6746         }
6747     }
6748
6749   if (CODING_REQUIRE_DETECTION (coding))
6750     detect_coding (coding);
6751   attrs = CODING_ID_ATTRS (coding->id);
6752
6753   if (EQ (dst_object, Qt)
6754       || (! NILP (CODING_ATTR_POST_READ (attrs))
6755           && NILP (dst_object)))
6756     {
6757       coding->dst_object = code_conversion_save (1, 1);
6758       coding->dst_pos = BEG;
6759       coding->dst_pos_byte = BEG_BYTE;
6760       coding->dst_multibyte = 1;
6761     }
6762   else if (BUFFERP (dst_object))
6763     {
6764       code_conversion_save (0, 0);
6765       coding->dst_object = dst_object;
6766       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6767       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6768       coding->dst_multibyte
6769         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6770     }
6771   else
6772     {
6773       code_conversion_save (0, 0);
6774       coding->dst_object = Qnil;
6775       coding->dst_multibyte = 1;
6776     }
6777
6778   decode_coding (coding);
6779
6780   if (BUFFERP (coding->dst_object))
6781     set_buffer_internal (XBUFFER (coding->dst_object));
6782
6783   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6784     {
6785       struct gcpro gcpro1, gcpro2;
6786       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6787       Lisp_Object val;
6788
6789       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6790       GCPRO2 (coding->src_object, coding->dst_object);
6791       val = call1 (CODING_ATTR_POST_READ (attrs),
6792                    make_number (coding->produced_char));
6793       UNGCPRO;
6794       CHECK_NATNUM (val);
6795       coding->produced_char += Z - prev_Z;
6796       coding->produced += Z_BYTE - prev_Z_BYTE;
6797     }
6798
6799   if (EQ (dst_object, Qt))
6800     {
6801       coding->dst_object = Fbuffer_string ();
6802     }
6803   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6804     {
6805       set_buffer_internal (XBUFFER (coding->dst_object));
6806       if (dst_bytes < coding->produced)
6807         {
6808           destination
6809             = (unsigned char *) xrealloc (destination, coding->produced);
6810           if (! destination)
6811             {
6812               record_conversion_result (coding,
6813                                         CODING_RESULT_INSUFFICIENT_DST);
6814               unbind_to (count, Qnil);
6815               return;
6816             }
6817           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6818             move_gap_both (BEGV, BEGV_BYTE);
6819           bcopy (BEGV_ADDR, destination, coding->produced);
6820           coding->destination = destination;
6821         }
6822     }
6823
6824   if (saved_pt >= 0)
6825     {
6826       /* This is the case of:
6827          (BUFFERP (src_object) && EQ (src_object, dst_object))
6828          As we have moved PT while replacing the original buffer
6829          contents, we must recover it now.  */
6830       set_buffer_internal (XBUFFER (src_object));
6831       if (saved_pt < from)
6832         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6833       else if (saved_pt < from + chars)
6834         TEMP_SET_PT_BOTH (from, from_byte);
6835       else if (! NILP (current_buffer->enable_multibyte_characters))
6836         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6837                           saved_pt_byte + (coding->produced - bytes));
6838       else
6839         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6840                           saved_pt_byte + (coding->produced - bytes));
6841     }
6842
6843   unbind_to (count, coding->dst_object);
6844 }
6845
6846
6847 void
6848 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6849                       dst_object)
6850      struct coding_system *coding;
6851      Lisp_Object src_object;
6852      EMACS_INT from, from_byte, to, to_byte;
6853      Lisp_Object dst_object;
6854 {
6855   int count = specpdl_ptr - specpdl;
6856   EMACS_INT chars = to - from;
6857   EMACS_INT bytes = to_byte - from_byte;
6858   Lisp_Object attrs;
6859   Lisp_Object buffer;
6860   int saved_pt = -1, saved_pt_byte;
6861
6862   buffer = Fcurrent_buffer ();
6863
6864   coding->src_object = src_object;
6865   coding->src_chars = chars;
6866   coding->src_bytes = bytes;
6867   coding->src_multibyte = chars < bytes;
6868
6869   attrs = CODING_ID_ATTRS (coding->id);
6870
6871   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6872     {
6873       coding->src_object = code_conversion_save (1, coding->src_multibyte);
6874       set_buffer_internal (XBUFFER (coding->src_object));
6875       if (STRINGP (src_object))
6876         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6877       else if (BUFFERP (src_object))
6878         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6879       else
6880         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6881
6882       if (EQ (src_object, dst_object))
6883         {
6884           set_buffer_internal (XBUFFER (src_object));
6885           saved_pt = PT, saved_pt_byte = PT_BYTE;
6886           del_range_both (from, from_byte, to, to_byte, 1);
6887           set_buffer_internal (XBUFFER (coding->src_object));
6888         }
6889
6890       call2 (CODING_ATTR_PRE_WRITE (attrs),
6891              make_number (BEG), make_number (Z));
6892       coding->src_object = Fcurrent_buffer ();
6893       if (BEG != GPT)
6894         move_gap_both (BEG, BEG_BYTE);
6895       coding->src_chars = Z - BEG;
6896       coding->src_bytes = Z_BYTE - BEG_BYTE;
6897       coding->src_pos = BEG;
6898       coding->src_pos_byte = BEG_BYTE;
6899       coding->src_multibyte = Z < Z_BYTE;
6900     }
6901   else if (STRINGP (src_object))
6902     {
6903       code_conversion_save (0, 0);
6904       coding->src_pos = from;
6905       coding->src_pos_byte = from_byte;
6906     }
6907   else if (BUFFERP (src_object))
6908     {
6909       code_conversion_save (0, 0);
6910       set_buffer_internal (XBUFFER (src_object));
6911       if (EQ (src_object, dst_object))
6912         {
6913           saved_pt = PT, saved_pt_byte = PT_BYTE;
6914           coding->src_object = del_range_1 (from, to, 1, 1);
6915           coding->src_pos = 0;
6916           coding->src_pos_byte = 0;
6917         }
6918       else
6919         {
6920           if (from < GPT && to >= GPT)
6921             move_gap_both (from, from_byte);
6922           coding->src_pos = from;
6923           coding->src_pos_byte = from_byte;
6924         }
6925     }
6926   else
6927     code_conversion_save (0, 0);
6928
6929   if (BUFFERP (dst_object))
6930     {
6931       coding->dst_object = dst_object;
6932       if (EQ (src_object, dst_object))
6933         {
6934           coding->dst_pos = from;
6935           coding->dst_pos_byte = from_byte;
6936         }
6937       else
6938         {
6939           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6940           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6941         }
6942       coding->dst_multibyte
6943         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6944     }
6945   else if (EQ (dst_object, Qt))
6946     {
6947       coding->dst_object = Qnil;
6948       coding->dst_bytes = coding->src_chars;
6949       if (coding->dst_bytes == 0)
6950         coding->dst_bytes = 1;
6951       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6952       coding->dst_multibyte = 0;
6953     }
6954   else
6955     {
6956       coding->dst_object = Qnil;
6957       coding->dst_multibyte = 0;
6958     }
6959
6960   encode_coding (coding);
6961
6962   if (EQ (dst_object, Qt))
6963     {
6964       if (BUFFERP (coding->dst_object))
6965         coding->dst_object = Fbuffer_string ();
6966       else
6967         {
6968           coding->dst_object
6969             = make_unibyte_string ((char *) coding->destination,
6970                                    coding->produced);
6971           xfree (coding->destination);
6972         }
6973     }
6974
6975   if (saved_pt >= 0)
6976     {
6977       /* This is the case of:
6978          (BUFFERP (src_object) && EQ (src_object, dst_object))
6979          As we have moved PT while replacing the original buffer
6980          contents, we must recover it now.  */
6981       set_buffer_internal (XBUFFER (src_object));
6982       if (saved_pt < from)
6983         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6984       else if (saved_pt < from + chars)
6985         TEMP_SET_PT_BOTH (from, from_byte);
6986       else if (! NILP (current_buffer->enable_multibyte_characters))
6987         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6988                           saved_pt_byte + (coding->produced - bytes));
6989       else
6990         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6991                           saved_pt_byte + (coding->produced - bytes));
6992     }
6993
6994   unbind_to (count, Qnil);
6995 }
6996
6997
6998 Lisp_Object
6999 preferred_coding_system ()
7000 {
7001   int id = coding_categories[coding_priorities[0]].id;
7002
7003   return CODING_ID_NAME (id);
7004 }
7005
7006 \f
7007 #ifdef emacs
7008 /*** 11. Emacs Lisp library functions ***/
7009
7010 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7011        doc: /* Return t if OBJECT is nil or a coding-system.
7012 See the documentation of `define-coding-system' for information
7013 about coding-system objects.  */)
7014      (obj)
7015      Lisp_Object obj;
7016 {
7017   return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
7018 }
7019
7020 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7021        Sread_non_nil_coding_system, 1, 1, 0,
7022        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7023      (prompt)
7024      Lisp_Object prompt;
7025 {
7026   Lisp_Object val;
7027   do
7028     {
7029       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7030                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
7031     }
7032   while (SCHARS (val) == 0);
7033   return (Fintern (val, Qnil));
7034 }
7035
7036 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7037        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7038 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
7039      (prompt, default_coding_system)
7040      Lisp_Object prompt, default_coding_system;
7041 {
7042   Lisp_Object val;
7043   if (SYMBOLP (default_coding_system))
7044     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
7045   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
7046                           Qt, Qnil, Qcoding_system_history,
7047                           default_coding_system, Qnil);
7048   return (SCHARS (val) == 0 ? Qnil : Fintern (val, Qnil));
7049 }
7050
7051 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7052        1, 1, 0,
7053        doc: /* Check validity of CODING-SYSTEM.
7054 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7055 It is valid if it is nil or a symbol defined as a coding system by the
7056 function `define-coding-system'.  */)
7057   (coding_system)
7058      Lisp_Object coding_system;
7059 {
7060   CHECK_SYMBOL (coding_system);
7061   if (!NILP (Fcoding_system_p (coding_system)))
7062     return coding_system;
7063   while (1)
7064     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7065 }
7066
7067 \f
7068 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
7069    HIGHEST is nonzero, return the coding system of the highest
7070    priority among the detected coding systems.  Otherwise return a
7071    list of detected coding systems sorted by their priorities.  If
7072    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7073    multibyte form but contains only ASCII and eight-bit chars.
7074    Otherwise, the bytes are raw bytes.
7075
7076    CODING-SYSTEM controls the detection as below:
7077
7078    If it is nil, detect both text-format and eol-format.  If the
7079    text-format part of CODING-SYSTEM is already specified
7080    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
7081    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7082    detect only text-format.  */
7083
7084 Lisp_Object
7085 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7086                       coding_system)
7087      const unsigned char *src;
7088      int src_chars, src_bytes, highest;
7089      int multibytep;
7090      Lisp_Object coding_system;
7091 {
7092   const unsigned char *src_end = src + src_bytes;
7093   Lisp_Object attrs, eol_type;
7094   Lisp_Object val;
7095   struct coding_system coding;
7096   int id;
7097   struct coding_detection_info detect_info;
7098   enum coding_category base_category;
7099
7100   if (NILP (coding_system))
7101     coding_system = Qundecided;
7102   setup_coding_system (coding_system, &coding);
7103   attrs = CODING_ID_ATTRS (coding.id);
7104   eol_type = CODING_ID_EOL_TYPE (coding.id);
7105   coding_system = CODING_ATTR_BASE_NAME (attrs);
7106
7107   coding.source = src;
7108   coding.src_chars = src_chars;
7109   coding.src_bytes = src_bytes;
7110   coding.src_multibyte = multibytep;
7111   coding.consumed = 0;
7112   coding.mode |= CODING_MODE_LAST_BLOCK;
7113
7114   detect_info.checked = detect_info.found = detect_info.rejected = 0;
7115
7116   /* At first, detect text-format if necessary.  */
7117   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7118   if (base_category == coding_category_undecided)
7119     {
7120       enum coding_category category;
7121       struct coding_system *this;
7122       int c, i;
7123
7124       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
7125       for (i = 0; src < src_end; i++, src++)
7126         {
7127           c = *src;
7128           if (c & 0x80)
7129             break;
7130           if (c < 0x20
7131               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO)
7132               && inhibit_iso_escape_detection)
7133             {
7134               coding.head_ascii = src - coding.source;
7135               if (detect_coding_iso_2022 (&coding, &detect_info))
7136                 {
7137                   /* We have scanned the whole data.  */
7138                   if (! (detect_info.rejected & CATEGORY_MASK_ISO_7_ELSE))
7139                     /* We didn't find an 8-bit code.  */
7140                     src = src_end;
7141                   break;
7142                 }
7143             }
7144         }
7145       coding.head_ascii = src - coding.source;
7146
7147       if (src < src_end
7148           || detect_info.found)
7149         {
7150           if (src == src_end)
7151             /* As all bytes are 7-bit, we can ignore non-ISO-2022 codings.  */
7152             for (i = 0; i < coding_category_raw_text; i++)
7153               {
7154                 category = coding_priorities[i];
7155                 if (detect_info.found & (1 << category))
7156                   break;
7157               }
7158           else
7159             for (i = 0; i < coding_category_raw_text; i++)
7160               {
7161                 category = coding_priorities[i];
7162                 this = coding_categories + category;
7163
7164                 if (this->id < 0)
7165                   {
7166                     /* No coding system of this category is defined.  */
7167                     detect_info.rejected |= (1 << category);
7168                   }
7169                 else if (category >= coding_category_raw_text)
7170                   continue;
7171                 else if (detect_info.checked & (1 << category))
7172                   {
7173                     if (highest
7174                         && (detect_info.found & (1 << category)))
7175                       break;
7176                   }
7177                 else
7178                   {
7179                     if ((*(this->detector)) (&coding, &detect_info)
7180                         && highest
7181                         && (detect_info.found & (1 << category)))
7182                       {
7183                         if (category == coding_category_utf_16_auto)
7184                           {
7185                             if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7186                               category = coding_category_utf_16_le;
7187                             else
7188                               category = coding_category_utf_16_be;
7189                           }
7190                         break;
7191                       }
7192                   }
7193               }
7194         }
7195
7196       if (detect_info.rejected == CATEGORY_MASK_ANY)
7197         {
7198           detect_info.found = CATEGORY_MASK_RAW_TEXT;
7199           id = coding_categories[coding_category_raw_text].id;
7200           val = Fcons (make_number (id), Qnil);
7201         }
7202       else if (! detect_info.rejected && ! detect_info.found)
7203         {
7204           detect_info.found = CATEGORY_MASK_ANY;
7205           id = coding_categories[coding_category_undecided].id;
7206           val = Fcons (make_number (id), Qnil);
7207         }
7208       else if (highest)
7209         {
7210           if (detect_info.found)
7211             {
7212               detect_info.found = 1 << category;
7213               val = Fcons (make_number (this->id), Qnil);
7214             }
7215           else
7216             for (i = 0; i < coding_category_raw_text; i++)
7217               if (! (detect_info.rejected & (1 << coding_priorities[i])))
7218                 {
7219                   detect_info.found = 1 << coding_priorities[i];
7220                   id = coding_categories[coding_priorities[i]].id;
7221                   val = Fcons (make_number (id), Qnil);
7222                   break;
7223                 }
7224         }
7225       else
7226         {
7227           int mask = detect_info.rejected | detect_info.found;
7228           int found = 0;
7229           val = Qnil;
7230
7231           for (i = coding_category_raw_text - 1; i >= 0; i--)
7232             {
7233               category = coding_priorities[i];
7234               if (! (mask & (1 << category)))
7235                 {
7236                   found |= 1 << category;
7237                   id = coding_categories[category].id;
7238                   val = Fcons (make_number (id), val);
7239                 }
7240             }
7241           for (i = coding_category_raw_text - 1; i >= 0; i--)
7242             {
7243               category = coding_priorities[i];
7244               if (detect_info.found & (1 << category))
7245                 {
7246                   id = coding_categories[category].id;
7247                   val = Fcons (make_number (id), val);
7248                 }
7249             }
7250           detect_info.found |= found;
7251         }
7252     }
7253   else if (base_category == coding_category_utf_16_auto)
7254     {
7255       if (detect_coding_utf_16 (&coding, &detect_info))
7256         {
7257           struct coding_system *this;
7258
7259           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7260             this = coding_categories + coding_category_utf_16_le;
7261           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7262             this = coding_categories + coding_category_utf_16_be;
7263           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7264             this = coding_categories + coding_category_utf_16_be_nosig;
7265           else
7266             this = coding_categories + coding_category_utf_16_le_nosig;
7267           val = Fcons (make_number (this->id), Qnil);
7268         }
7269     }
7270   else
7271     {
7272       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7273       val = Fcons (make_number (coding.id), Qnil);
7274     }
7275
7276   /* Then, detect eol-format if necessary.  */
7277   {
7278     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7279     Lisp_Object tail;
7280
7281     if (VECTORP (eol_type))
7282       {
7283         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7284           normal_eol = detect_eol (coding.source, src_bytes,
7285                                    coding_category_raw_text);
7286         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7287                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7288           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7289                                       coding_category_utf_16_be);
7290         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7291                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7292           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7293                                       coding_category_utf_16_le);
7294       }
7295     else
7296       {
7297         if (EQ (eol_type, Qunix))
7298           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7299         else if (EQ (eol_type, Qdos))
7300           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7301         else
7302           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7303       }
7304
7305     for (tail = val; CONSP (tail); tail = XCDR (tail))
7306       {
7307         enum coding_category category;
7308         int this_eol;
7309
7310         id = XINT (XCAR (tail));
7311         attrs = CODING_ID_ATTRS (id);
7312         category = XINT (CODING_ATTR_CATEGORY (attrs));
7313         eol_type = CODING_ID_EOL_TYPE (id);
7314         if (VECTORP (eol_type))
7315           {
7316             if (category == coding_category_utf_16_be
7317                 || category == coding_category_utf_16_be_nosig)
7318               this_eol = utf_16_be_eol;
7319             else if (category == coding_category_utf_16_le
7320                      || category == coding_category_utf_16_le_nosig)
7321               this_eol = utf_16_le_eol;
7322             else
7323               this_eol = normal_eol;
7324
7325             if (this_eol == EOL_SEEN_LF)
7326               XSETCAR (tail, AREF (eol_type, 0));
7327             else if (this_eol == EOL_SEEN_CRLF)
7328               XSETCAR (tail, AREF (eol_type, 1));
7329             else if (this_eol == EOL_SEEN_CR)
7330               XSETCAR (tail, AREF (eol_type, 2));
7331             else
7332               XSETCAR (tail, CODING_ID_NAME (id));
7333           }
7334         else
7335           XSETCAR (tail, CODING_ID_NAME (id));
7336       }
7337   }
7338
7339   return (highest ? XCAR (val) : val);
7340 }
7341
7342
7343 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7344        2, 3, 0,
7345        doc: /* Detect coding system of the text in the region between START and END.
7346 Return a list of possible coding systems ordered by priority.
7347
7348 If only ASCII characters are found, it returns a list of single element
7349 `undecided' or its subsidiary coding system according to a detected
7350 end-of-line format.
7351
7352 If optional argument HIGHEST is non-nil, return the coding system of
7353 highest priority.  */)
7354      (start, end, highest)
7355      Lisp_Object start, end, highest;
7356 {
7357   int beg_char, end_char;
7358   int beg_byte, end_byte;
       int want_highest, multibyte;
7359
7360   CHECK_NUMBER_COERCE_MARKER (start);
7361   CHECK_NUMBER_COERCE_MARKER (end);
7362
7363   validate_region (&start, &end);
7364   beg_char = XINT (start);
       end_char = XINT (end);
7365   beg_byte = CHAR_TO_BYTE (beg_char);
7366   end_byte = CHAR_TO_BYTE (end_char);
7367
       /* When the gap position lies after the region start and at or
          before its end, move the gap to the region end -- presumably so
          that the bytes handed to the detector are contiguous.  */
7368   if (beg_char < GPT && GPT <= end_char)
7369     move_gap_both (end_char, end_byte);
7370
       want_highest = ! NILP (highest);
       multibyte = ! NILP (current_buffer->enable_multibyte_characters);
7371   return detect_coding_system (BYTE_POS_ADDR (beg_byte),
7372                                end_char - beg_char, end_byte - beg_byte,
7373                                want_highest, multibyte, Qnil);
7377 }
7378
7379 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7380        1, 2, 0,
7381        doc: /* Detect coding system of the text in STRING.
7382 Return a list of possible coding systems ordered by priority.
7383
7384 If only ASCII characters are found, it returns a list of single element
7385 `undecided' or its subsidiary coding system according to a detected
7386 end-of-line format.
7387
7388 If optional argument HIGHEST is non-nil, return the coding system of
7389 highest priority.  */)
7390      (string, highest)
7391      Lisp_Object string, highest;
7392 {
       int want_highest, multibyte;

7393   CHECK_STRING (string);
7394
       /* Hand the string's data, character count and byte count to the
          shared detector, together with the two flags.  */
       want_highest = ! NILP (highest);
       multibyte = STRING_MULTIBYTE (string);
7395   return detect_coding_system (SDATA (string),
7396                                SCHARS (string), SBYTES (string),
7397                                want_highest, multibyte, Qnil);
7399 }
7400
7401
7402 static INLINE int
7403 char_encodable_p (c, attrs)
7404      int c;
7405      Lisp_Object attrs;
7406 {
       /* Return nonzero if character C, after being mapped through the
          translation table stored in ATTRS (when there is one), belongs
          to one of the charsets on the charset list of ATTRS.  */
7407   Lisp_Object table, rest;
7410
7411   table = CODING_ATTR_TRANS_TBL (attrs);
7412   if (! NILP (table))
7413     c = translate_char (table, c);

       /* The first charset containing C settles the question.  */
7414   rest = CODING_ATTR_CHARSET_LIST (attrs);
7415   while (CONSP (rest))
7416     {
7417       struct charset *cs = CHARSET_FROM_ID (XINT (XCAR (rest)));

7418       if (CHAR_CHARSET_P (c, cs))
7419         return 1;
           rest = XCDR (rest);
7420     }
       /* No charset matched; the result is decided by whatever
          terminated the list.  */
7421   return (! NILP (rest));
7422 }
7423
7424
7425 /* Return a list of coding systems that safely encode the text between
7426    START and END.  If EXCLUDE is non-nil, it is a list of coding
7427    systems not to check.  The returned list doesn't contain any such
7428    coding systems.  In any case, if the text contains only ASCII or is
7429    unibyte, return t.

        START may also be a string; in that case the whole string is
        examined and END is not consulted.

        Otherwise the value always contains `raw-text' and
        `no-conversion', preceded by the base names of the coding systems
        that can encode every non-ASCII character of the text.  */
7430
7431 DEFUN ("find-coding-systems-region-internal",
7432        Ffind_coding_systems_region_internal,
7433        Sfind_coding_systems_region_internal, 2, 3, 0,
7434        doc: /* Internal use only.  */)
7435      (start, end, exclude)
7436      Lisp_Object start, end, exclude;
7437 {
7438   Lisp_Object coding_attrs_list, safe_codings;
7439   EMACS_INT start_byte, end_byte;
7440   const unsigned char *p, *pbeg, *pend;
7441   int c;
7442   Lisp_Object tail, elt;
7443
7444   if (STRINGP (start))
7445     {
           /* A unibyte string, or one whose character count equals its
              byte count, needs no further checking.  */
7446       if (!STRING_MULTIBYTE (start)
7447           || SCHARS (start) == SBYTES (start))
7448         return Qt;
7449       start_byte = 0;
7450       end_byte = SBYTES (start);
7451     }
7452   else
7453     {
7454       CHECK_NUMBER_COERCE_MARKER (start);
7455       CHECK_NUMBER_COERCE_MARKER (end);
7456       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7457         args_out_of_range (start, end);
7458       if (NILP (current_buffer->enable_multibyte_characters))
7459         return Qt;
7460       start_byte = CHAR_TO_BYTE (XINT (start));
7461       end_byte = CHAR_TO_BYTE (XINT (end));
           /* Same number of characters and bytes: nothing to check.  */
7462       if (XINT (end) - XINT (start) == end_byte - start_byte)
7463         return Qt;
7464
           /* The scan below walks a single pointer range, so when the gap
              lies strictly inside the region, move it to whichever end
              of the region is nearer.  */
7465       if (XINT (start) < GPT && XINT (end) > GPT)
7466         {
7467           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7468             move_gap_both (XINT (start), start_byte);
7469           else
7470             move_gap_both (XINT (end), end_byte);
7471         }
7472     }
7473
       /* Collect the attribute vectors of the candidate coding systems:
          every entry of Vcoding_system_list not listed in EXCLUDE whose
          name is its own base name and whose type is not `undecided'.
          Note that the translation table slot of each collected
          attribute vector is overwritten here.  */
7474   coding_attrs_list = Qnil;
7475   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7476     if (NILP (exclude)
7477         || NILP (Fmemq (XCAR (tail), exclude)))
7478       {
7479         Lisp_Object attrs;
7480
7481         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7482         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7483             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7484           {
7485             ASET (attrs, coding_attr_trans_tbl,
7486                   get_translation_table (attrs, 1, NULL));
7487             coding_attrs_list = Fcons (attrs, coding_attrs_list);
7488           }
7489       }
7490
7491   if (STRINGP (start))
7492     p = pbeg = SDATA (start);
7493   else
7494     p = pbeg = BYTE_POS_ADDR (start_byte);
7495   pend = p + (end_byte - start_byte);
7496
       /* Trim the leading and trailing runs of ASCII bytes.  */
7497   while (p < pend && ASCII_BYTE_P (*p)) p++;
7498   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7499
7500   while (p < pend)
7501     {
7502       if (ASCII_BYTE_P (*p))
7503         p++;
7504       else
7505         {
7506           c = STRING_CHAR_ADVANCE (p);
7507
7508           charset_map_loaded = 0;
               /* Drop every candidate that cannot encode C.  A cell is
                  removed in place by copying its successor into it (TAIL
                  is then deliberately not advanced, so the copied element
                  gets checked too); the last cell cannot be spliced out
                  that way, so its car is set to nil instead.  */
7509           for (tail = coding_attrs_list; CONSP (tail);)
7510             {
7511               elt = XCAR (tail);
7512               if (NILP (elt))
7513                 tail = XCDR (tail);
7514               else if (char_encodable_p (c, elt))
7515                 tail = XCDR (tail);
7516               else if (CONSP (XCDR (tail)))
7517                 {
7518                   XSETCAR (tail, XCAR (XCDR (tail)));
7519                   XSETCDR (tail, XCDR (XCDR (tail)));
7520                 }
7521               else
7522                 {
7523                   XSETCAR (tail, Qnil);
7524                   tail = XCDR (tail);
7525                 }
7526             }
               /* NOTE(review): charset_map_loaded is presumably set when
                  a charset map gets loaded during the checks above, which
                  may relocate the text -- confirm.  The pointers are
                  rebuilt from their offsets in that case.  */
7527           if (charset_map_loaded)
7528             {
7529               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7530
7531               if (STRINGP (start))
7532                 pbeg = SDATA (start);
7533               else
7534                 pbeg = BYTE_POS_ADDR (start_byte);
7535               p = pbeg + p_offset;
7536               pend = pbeg + pend_offset;
7537             }
7538         }
7539     }
7540
       /* `raw-text' and `no-conversion' are always part of the result;
          the surviving candidates are pushed in front of them.  */
7541   safe_codings = list2 (Qraw_text, Qno_conversion);
7542   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7543     if (! NILP (XCAR (tail)))
7544       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7545
7546   return safe_codings;
7547 }
7548
7549
7550 DEFUN ("unencodable-char-position", Funencodable_char_position,
7551        Sunencodable_char_position, 3, 5, 0,
7552        doc: /*
7553 Return position of first un-encodable character in a region.
7554 START and END specify the region and CODING-SYSTEM specifies the
7555 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7556
7557 If optional 4th argument COUNT is non-nil, it specifies at most how
7558 many un-encodable characters to search.  In this case, the value is a
7559 list of positions.
7560
7561 If optional 5th argument STRING is non-nil, it is a string to search
7562 for un-encodable characters.  In that case, START and END are indexes
7563 to the string.  */)
7564      (start, end, coding_system, count, string)
7565      Lisp_Object start, end, coding_system, count, string;
7566 {
7567   int n;
7568   struct coding_system coding;
7569   Lisp_Object attrs, charset_list, translation_table;
7570   Lisp_Object positions;
7571   int from, to;
7572   const unsigned char *p, *stop, *pend;
7573   int ascii_compatible;
7574
7575   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7576   attrs = CODING_ID_ATTRS (coding.id);
       /* A coding system of type raw-text never reports a position.  */
7577   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7578     return Qnil;
7579   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7580   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7581   translation_table = get_translation_table (attrs, 1, NULL);
7582
7583   if (NILP (string))
7584     {
7585       validate_region (&start, &end);
7586       from = XINT (start);
7587       to = XINT (end);
           /* Nothing to report for a unibyte buffer, or for a region
              with as many bytes as characters when the coding system is
              ASCII compatible.  */
7588       if (NILP (current_buffer->enable_multibyte_characters)
7589           || (ascii_compatible
7590               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7591         return Qnil;
7592       p = CHAR_POS_ADDR (from);
7593       pend = CHAR_POS_ADDR (to);
           /* The gap is not moved here.  If it falls inside the region,
              scan up to the gap first; the main loop then continues
              from the end of the gap.  */
7594       if (from < GPT && to >= GPT)
7595         stop = GPT_ADDR;
7596       else
7597         stop = pend;
7598     }
7599   else
7600     {
7601       CHECK_STRING (string);
7602       CHECK_NATNUM (start);
7603       CHECK_NATNUM (end);
7604       from = XINT (start);
7605       to = XINT (end);
7606       if (from > to
7607           || to > SCHARS (string))
7608         args_out_of_range_3 (string, start, end);
7609       if (! STRING_MULTIBYTE (string))
7610         return Qnil;
7611       p = SDATA (string) + string_char_to_byte (string, from);
7612       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7613       if (ascii_compatible && (to - from) == (pend - p))
7614         return Qnil;
7615     }
7616
       /* N is the number of positions still to collect.
          NOTE(review): with COUNT equal to 0, N becomes negative on the
          first hit and never compares equal to 0, so every un-encodable
          position up to END is collected.  */
7617   if (NILP (count))
7618     n = 1;
7619   else
7620     {
7621       CHECK_NATNUM (count);
7622       n = XINT (count);
7623     }
7624
7625   positions = Qnil;
7626   while (1)
7627     {
7628       int c;
7629
           /* FROM is advanced in step with P so that it always holds the
              character position of the character at P.  */
7630       if (ascii_compatible)
7631         while (p < stop && ASCII_BYTE_P (*p))
7632           p++, from++;
7633       if (p >= stop)
7634         {
7635           if (p >= pend)
7636             break;
               /* STOP was the start of the gap: resume after the gap.  */
7637           stop = pend;
7638           p = GAP_END_ADDR;
7639         }
7640
7641       c = STRING_CHAR_ADVANCE (p);
           /* A null result from char_charset means that no charset on
              the list accepted the (translated) character.  */
7642       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7643           && ! char_charset (translate_char (translation_table, c),
7644                              charset_list, NULL))
7645         {
7646           positions = Fcons (make_number (from), positions);
7647           n--;
7648           if (n == 0)
7649             break;
7650         }
7651
7652       from++;
7653     }
7654
       /* POSITIONS was accumulated in reverse order.  */
7655   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7656 }
7657
7658
7659 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7660        Scheck_coding_systems_region, 3, 3, 0,
7661        doc: /* Check if the region is encodable by coding systems.
7662
7663 START and END are buffer positions specifying the region.
7664 CODING-SYSTEM-LIST is a list of coding systems to check.
7665
7666 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7667 CODING-SYSTEM is a member of CODING-SYSTEM-LIST and can't encode the
7668 whole region, POS0, POS1, ... are buffer positions where non-encodable
7669 characters are found.
7670
7671 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7672 value is nil.
7673
7674 START may be a string.  In that case, check if the string is
7675 encodable, and the value contains indices to the string instead of
7676 buffer positions.  END is ignored.  */)
7677      (start, end, coding_system_list)
7678      Lisp_Object start, end, coding_system_list;
7679 {
7680   Lisp_Object list;
7681   EMACS_INT start_byte, end_byte;
7682   int pos;
7683   const unsigned char *p, *pbeg, *pend;
7684   int c;
7685   Lisp_Object tail, elt, attrs;
7686
7687   if (STRINGP (start))
7688     {
           /* A unibyte string, or a multibyte string holding as many
              bytes as characters, has nothing to report.  This is the
              same test find-coding-systems-region-internal uses.  */
7689       if (!STRING_MULTIBYTE (start)
7690           || SCHARS (start) == SBYTES (start))
7691         return Qnil;
7692       start_byte = 0;
7693       end_byte = SBYTES (start);
7694       pos = 0;
7695     }
7696   else
7697     {
7698       CHECK_NUMBER_COERCE_MARKER (start);
7699       CHECK_NUMBER_COERCE_MARKER (end);
7700       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7701         args_out_of_range (start, end);
7702       if (NILP (current_buffer->enable_multibyte_characters))
7703         return Qnil;
7704       start_byte = CHAR_TO_BYTE (XINT (start));
7705       end_byte = CHAR_TO_BYTE (XINT (end));
           /* As many bytes as characters: every coding system is taken
              to encode the region, which the documented contract
              expresses as nil (not t).  */
7706       if (XINT (end) - XINT (start) == end_byte - start_byte)
7707         return Qnil;
7708
           /* The scan below walks a single pointer range, so when the gap
              lies strictly inside the region, move it to whichever end
              of the region is nearer.  */
7709       if (XINT (start) < GPT && XINT (end) > GPT)
7710         {
7711           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7712             move_gap_both (XINT (start), start_byte);
7713           else
7714             move_gap_both (XINT (end), end_byte);
7715         }
7716       pos = XINT (start);
7717     }
7718
       /* Build a work list of (CODING-SYSTEM ATTRS) entries; positions
          of non-encodable characters are pushed right after ATTRS.  The
          translation table slot of each attribute vector is overwritten
          here.  */
7719   list = Qnil;
7720   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7721     {
7722       elt = XCAR (tail);
7723       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7724       ASET (attrs, coding_attr_trans_tbl,
7725             get_translation_table (attrs, 1, NULL));
7726       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7727     }
7728
7729   if (STRINGP (start))
7730     p = pbeg = SDATA (start);
7731   else
7732     p = pbeg = BYTE_POS_ADDR (start_byte);
7733   pend = p + (end_byte - start_byte);
7734
       /* Trim the leading and trailing ASCII runs; POS follows P.  */
7735   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7736   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7737
7738   while (p < pend)
7739     {
7740       if (ASCII_BYTE_P (*p))
7741         p++;
7742       else
7743         {
7744           c = STRING_CHAR_ADVANCE (p);
7745
7746           charset_map_loaded = 0;
7747           for (tail = list; CONSP (tail); tail = XCDR (tail))
7748             {
7749               elt = XCDR (XCAR (tail));
7750               if (! char_encodable_p (c, XCAR (elt)))
7751                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7752             }
               /* NOTE(review): charset_map_loaded is presumably set when
                  a charset map gets loaded during the checks above, which
                  may relocate the text -- confirm.  The pointers are
                  rebuilt from their offsets in that case.  */
7753           if (charset_map_loaded)
7754             {
7755               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7756
7757               if (STRINGP (start))
7758                 pbeg = SDATA (start);
7759               else
7760                 pbeg = BYTE_POS_ADDR (start_byte);
7761               p = pbeg + p_offset;
7762               pend = pbeg + pend_offset;
7763             }
7764         }
7765       pos++;
7766     }
7767
       /* Keep only the coding systems that collected at least one
          position.  The work list was built in reverse, so consing the
          result here restores the order of CODING-SYSTEM-LIST; each
          position list is reversed back into ascending order.  */
7768   tail = list;
7769   list = Qnil;
7770   for (; CONSP (tail); tail = XCDR (tail))
7771     {
7772       elt = XCAR (tail);
7773       if (CONSP (XCDR (XCDR (elt))))
7774         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7775                       list);
7776     }
7777
7778   return list;
7779 }
7780
7781
7782 Lisp_Object
7783 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7784      Lisp_Object start, end, coding_system, dst_object;
7785      int encodep, norecord;
7786 {
       /* Encode (ENCODEP nonzero) or decode (ENCODEP zero) the text of
          the current buffer between START and END with CODING_SYSTEM; a
          nil CODING_SYSTEM stands for `no-conversion'.  DST_OBJECT is
          nil (meaning the current buffer), a buffer, or t.  Unless
          NORECORD is nonzero, store the name of the coding system used
          in Vlast_coding_system_used.  Return the number of characters
          produced when the destination is a buffer, otherwise the
          destination object recorded by the conversion.  */
7787   struct coding_system conv;
7788   EMACS_INT beg, beg_byte, lim, lim_byte;
7789   Lisp_Object source;
7790
7791   CHECK_NUMBER_COERCE_MARKER (start);
7792   CHECK_NUMBER_COERCE_MARKER (end);
7793   if (! NILP (coding_system))
         {
7796       CHECK_CODING_SYSTEM (coding_system);
         }
7795   else
7794     coding_system = Qno_conversion;

7797   source = Fcurrent_buffer ();
7798   if (NILP (dst_object))
7799     dst_object = source;
7800   else if (! EQ (dst_object, Qt))
         {
7801       CHECK_BUFFER (dst_object);
         }
7802
7803   validate_region (&start, &end);
7804   beg = XFASTINT (start);
7805   beg_byte = CHAR_TO_BYTE (beg);
7806   lim = XFASTINT (end);
7807   lim_byte = CHAR_TO_BYTE (lim);
7808
7809   setup_coding_system (coding_system, &conv);
7810   conv.mode |= CODING_MODE_LAST_BLOCK;
7811
7812   if (! encodep)
7816     decode_coding_object (&conv, source, beg, beg_byte, lim, lim_byte,
7817                           dst_object);
7815   else
7813     encode_coding_object (&conv, source, beg, beg_byte, lim, lim_byte,
7814                           dst_object);

7818   if (! norecord)
7819     Vlast_coding_system_used = CODING_ID_NAME (conv.id);
7820
7821   if (BUFFERP (dst_object))
7822     return make_number (conv.produced_char);
7823   return conv.dst_object;
7824 }
7825
7826
7827 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7828        3, 4, "r\nzCoding system: ",
7829        doc: /* Decode the current region from the specified coding system.
7830 When called from a program, takes four arguments:
7831         START, END, CODING-SYSTEM, and DESTINATION.
7832 START and END are buffer positions.
7833
7834 Optional 4th argument DESTINATION specifies where the decoded text goes.
7835 If nil, the region between START and END is replaced by the decoded text.
7836 If a buffer, the decoded text is inserted in the buffer.
7837 If t, the decoded text is returned.
7838
7839 This function sets `last-coding-system-used' to the precise coding system
7840 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7841 not fully specified.)
7842 It returns the length of the decoded text.  */)
7843      (start, end, coding_system, destination)
7844      Lisp_Object start, end, coding_system, destination;
7845 {
       /* ENCODEP is 0 because this entry point decodes; NORECORD is 0 so
          that code_convert_region updates Vlast_coding_system_used.  */
       int encodep = 0, norecord = 0;

7846   return code_convert_region (start, end, coding_system, destination,
                                   encodep, norecord);
7847 }
7848
7849 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7850        3, 4, "r\nzCoding system: ",
7851        doc: /* Encode the current region by specified coding system.
7852 When called from a program, takes four arguments:
7853         START, END, CODING-SYSTEM, and DESTINATION.
         START and END are buffer positions.
7854
7855 Optional 4th argument DESTINATION specifies where the encoded text goes.
7856 If nil, the region between START and END is replaced by the encoded text.
7857 If a buffer, the encoded text is inserted in the buffer.
7858 If t, the encoded text is returned.
7859
7860 This function sets `last-coding-system-used' to the precise coding system
7861 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7862 not fully specified.)
7863 It returns the length of the encoded text.  */)
7864   (start, end, coding_system, destination)
7865      Lisp_Object start, end, coding_system, destination;
7866 {
       /* ENCODEP is 1 because this entry point encodes; NORECORD is 0 so
          that code_convert_region updates Vlast_coding_system_used.  */
       int encodep = 1, norecord = 0;

7867   return code_convert_region (start, end, coding_system, destination,
                                   encodep, norecord);
7868 }
7869
7870 Lisp_Object
7871 code_convert_string (string, coding_system, dst_object,
7872                      encodep, nocopy, norecord)
7873      Lisp_Object string, coding_system, dst_object;
7874      int encodep, nocopy, norecord;
7875 {
       /* Encode (ENCODEP nonzero) or decode (ENCODEP zero) STRING with
          CODING_SYSTEM; a nil CODING_SYSTEM stands for `no-conversion'.
          DST_OBJECT is nil or t, or a buffer to receive the result.
          NOCOPY nonzero means STRING itself may be returned when no
          conversion is needed.  Unless NORECORD is nonzero, store the
          name of the coding system used in Vlast_coding_system_used.
          Return the number of characters produced when DST_OBJECT is a
          buffer, otherwise the destination object recorded by the
          conversion.  */
7876   struct coding_system coding;
7877   EMACS_INT chars, bytes;
7878
7879   CHECK_STRING (string);
7880   if (NILP (coding_system))
7881     {
7882       if (! norecord)
7883         Vlast_coding_system_used = Qno_conversion;
           /* Trivial case: nothing to convert and nowhere to insert.
              Return STRING itself only when the caller allowed it with
              NOCOPY; otherwise return a fresh copy.  (The two arms used
              to be swapped.)  */
7884       if (NILP (dst_object))
7885         return (nocopy ? string : Fcopy_sequence (string));
7886     }
7887
7888   if (NILP (coding_system))
7889     coding_system = Qno_conversion;
7890   else
7891     CHECK_CODING_SYSTEM (coding_system);
7892   if (NILP (dst_object))
7893     dst_object = Qt;
7894   else if (! EQ (dst_object, Qt))
7895     CHECK_BUFFER (dst_object);
7896
7897   setup_coding_system (coding_system, &coding);
7898   coding.mode |= CODING_MODE_LAST_BLOCK;
7899   chars = SCHARS (string);
7900   bytes = SBYTES (string);
7901   if (encodep)
7902     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7903   else
7904     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7905   if (! norecord)
7906     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7907
7908   return (BUFFERP (dst_object)
7909           ? make_number (coding.produced_char)
7910           : coding.dst_object);
7911 }
7912
7913
7914 /* Convert STRING with CODING_SYSTEM: encode when ENCODEP is nonzero,
7915    decode otherwise.  Vlast_coding_system_used is left untouched.
7916
7917    This function is called only from macros DECODE_FILE and
7918    ENCODE_FILE, thus we ignore character composition.  */
7919
7920 Lisp_Object
7921 code_convert_string_norecord (string, coding_system, encodep)
7922      Lisp_Object string, coding_system;
7923      int encodep;
7924 {
       /* Destination t, NOCOPY off, NORECORD on.  */
       int nocopy = 0, norecord = 1;

7925   return code_convert_string (string, coding_system, Qt,
                                   encodep, nocopy, norecord);
7926 }
7927
7928
7929 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7930        2, 4, 0,
7931        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7932
7933 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7934 if the decoding operation is trivial.
7935
7936 Optional fourth arg BUFFER non-nil means that the decoded text is
7937 inserted in BUFFER instead of returned as a string.  In this case,
7938 the return value is the length of the decoded text.
7939
7940 This function sets `last-coding-system-used' to the precise coding system
7941 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7942 not fully specified.)  */)
7943   (string, coding_system, nocopy, buffer)
7944      Lisp_Object string, coding_system, nocopy, buffer;
7945 {
       /* ENCODEP is 0 because this entry point decodes; NORECORD is 0 so
          that code_convert_string updates Vlast_coding_system_used.  */
       int encodep = 0, norecord = 0;
       int allow_nocopy = ! NILP (nocopy);

7946   return code_convert_string (string, coding_system, buffer,
7947                               encodep, allow_nocopy, norecord);
7948 }
7949
7950 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7951        2, 4, 0,
7952        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7953
7954 Optional third arg NOCOPY non-nil means it is OK to return STRING
7955 itself if the encoding operation is trivial.
7956
7957 Optional fourth arg BUFFER non-nil means that the encoded text is
7958 inserted in BUFFER instead of returned as a string.  In this case,
7959 the return value is the length of the encoded text.
7960
7961 This function sets `last-coding-system-used' to the precise coding system
7962 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7963 not fully specified.)  */)
7964      (string, coding_system, nocopy, buffer)
7965      Lisp_Object string, coding_system, nocopy, buffer;
7966 {
       /* The last argument is NORECORD.  It must be 0 here: the
          documentation above promises that `last-coding-system-used' is
          set, exactly as decode-coding-string does.  Passing 1 silently
          suppressed that.  */
7967   return code_convert_string (string, coding_system, buffer,
7968                               1, ! NILP (nocopy), 0);
7969 }
7970
7971 \f
7972 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7973        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7974 Return the corresponding character.  */)
7975      (code)
7976      Lisp_Object code;
7977 {
7978   Lisp_Object spec, attrs, val;
7979   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7980   int c;
7981
7982   CHECK_NATNUM (code);
7983   c = XFASTINT (code);
7984   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7985   attrs = AREF (spec, 0);
7986
       /* ASCII passes through unchanged when the coding system is ASCII
          compatible.  */
7987   if (ASCII_BYTE_P (c)
7988       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7989     return code;
7990
       /* The first three entries of the charset list are used, in this
          order, as the roman, kana and kanji charsets.  */
7991   val = CODING_ATTR_CHARSET_LIST (attrs);
7992   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7993   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7994   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7995
7996   if (c <= 0x7F)
7997     charset = charset_roman;
       /* Single-byte kana.  0xDF is the last half-width katakana code of
          Shift_JIS, so the upper bound is inclusive.  */
7998   else if (c >= 0xA0 && c <= 0xDF)
7999     {
8000       charset = charset_kana;
8001       c -= 0x80;
8002     }
8003   else
8004     {
           /* Two-byte code: validate lead byte S1 and trail byte S2.  */
8005       int s1 = c >> 8, s2 = c & 0xFF;
8006
8007       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
8008           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
             /* Report the integer value of CODE, not the Lisp object.  */
8009         error ("Invalid code: %d", (int) XFASTINT (code));
8010       SJIS_TO_JIS (c);
8011       charset = charset_kanji;
8012     }
8013   c = DECODE_CHAR (charset, c);
8014   if (c < 0)
8015     error ("Invalid code: %d", (int) XFASTINT (code));
8016   return make_number (c);
8017 }
8018
8019
8020 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
8021        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
8022 Return the corresponding code in SJIS.  */)
8023      (ch)
8024     Lisp_Object ch;
8025 {
8026   Lisp_Object spec, attrs, charset_list;
8027   int c;
8028   struct charset *charset;
8029   unsigned code;
8030
8031   CHECK_CHARACTER (ch);
8032   c = XFASTINT (ch);
8033   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8034   attrs = AREF (spec, 0);
8035
       /* ASCII passes through unchanged when the coding system is ASCII
          compatible.  */
8036   if (ASCII_CHAR_P (c)
8037       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8038     return ch;
8039
8040   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8041   charset = char_charset (c, charset_list, &code);
       /* char_charset is tested for a null result elsewhere in this file
          (unencodable-char-position), so guard against it before the
          charset is inspected.  NOTE(review): CODE is presumably left
          unset in that case -- confirm against char_charset.  */
8042   if (! charset || code == CHARSET_INVALID_CODE (charset))
8043     error ("Can't encode by shift_jis encoding: %d", c);
8044   JIS_TO_SJIS (code);
8045
8046   return make_number (code);
8047 }
8048
8049 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8050        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8051 Return the corresponding character.  */)
8052      (code)
8053      Lisp_Object code;
8054 {
8055   Lisp_Object spec, attrs, val;
8056   struct charset *charset_roman, *charset_big5, *charset;
8057   int c;
8058
8059   CHECK_NATNUM (code);
8060   c = XFASTINT (code);
8061   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8062   attrs = AREF (spec, 0);
8063
       /* ASCII passes through unchanged when the coding system is ASCII
          compatible.  */
8064   if (ASCII_BYTE_P (c)
8065       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8066     return code;
8067
       /* The first two entries of the charset list are used, in this
          order, as the roman and Big5 charsets.  */
8068   val = CODING_ATTR_CHARSET_LIST (attrs);
8069   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8070   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8071
8072   if (c <= 0x7F)
8073     charset = charset_roman;
8074   else
8075     {
           /* Split into lead byte B1 and trail byte B2.  The trail byte
              needs the full 8-bit mask: with 0x7F the range checks for
              0xA1..0xFE below could never match, and valid trail bytes
              0xA1..0xBF were rejected as being below 0x40.  */
8076       int b1 = c >> 8, b2 = c & 0xFF;
8077       if (b1 < 0xA1 || b1 > 0xFE
8078           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
             /* Report the integer value of CODE, not the Lisp object.  */
8079         error ("Invalid code: %d", (int) XFASTINT (code));
8080       charset = charset_big5;
8081     }
8082   c = DECODE_CHAR (charset, (unsigned )c);
8083   if (c < 0)
8084     error ("Invalid code: %d", (int) XFASTINT (code));
8085   return make_number (c);
8086 }
8087
8088 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8089        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
8090 Return the corresponding character code in Big5.  */)
8091      (ch)
8092      Lisp_Object ch;
8093 {
8094   Lisp_Object spec, attrs, charset_list;
8095   struct charset *charset;
8096   int c;
8097   unsigned code;
8098
8099   CHECK_CHARACTER (ch);
8100   c = XFASTINT (ch);
8101   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8102   attrs = AREF (spec, 0);
8103   if (ASCII_CHAR_P (c)
8104       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8105     return ch;
8106
8107   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8108   charset = char_charset (c, charset_list, &code);
8109   if (code == CHARSET_INVALID_CODE (charset))
8110     error ("Can't encode by Big5 encoding: %d", c);
8111
8112   return make_number (code);
8113 }
8114
8115 \f
8116 DEFUN ("set-terminal-coding-system-internal",
8117        Fset_terminal_coding_system_internal,
8118        Sset_terminal_coding_system_internal, 1, 1, 0,
8119        doc: /* Internal use only.  */)
8120      (coding_system)
8121      Lisp_Object coding_system;
8122 {
8123   CHECK_SYMBOL (coding_system);
8124   setup_coding_system (Fcheck_coding_system (coding_system),
8125                         &terminal_coding);
8126
8127   /* We had better not send unsafe characters to terminal.  */
8128   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
8129   /* Characer composition should be disabled.  */
8130   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8131   terminal_coding.src_multibyte = 1;
8132   terminal_coding.dst_multibyte = 0;
8133   return Qnil;
8134 }
8135
8136 DEFUN ("set-safe-terminal-coding-system-internal",
8137        Fset_safe_terminal_coding_system_internal,
8138        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8139        doc: /* Internal use only.  */)
8140      (coding_system)
8141      Lisp_Object coding_system;
8142 {
8143   CHECK_SYMBOL (coding_system);
8144   setup_coding_system (Fcheck_coding_system (coding_system),
8145                        &safe_terminal_coding);
8146   /* Characer composition should be disabled.  */
8147   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8148   safe_terminal_coding.src_multibyte = 1;
8149   safe_terminal_coding.dst_multibyte = 0;
8150   return Qnil;
8151 }
8152
8153 DEFUN ("terminal-coding-system",
8154        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
8155        doc: /* Return coding system specified for terminal output.  */)
8156      ()
8157 {
8158   return CODING_ID_NAME (terminal_coding.id);
8159 }
8160
8161 DEFUN ("set-keyboard-coding-system-internal",
8162        Fset_keyboard_coding_system_internal,
8163        Sset_keyboard_coding_system_internal, 1, 1, 0,
8164        doc: /* Internal use only.  */)
8165      (coding_system)
8166      Lisp_Object coding_system;
8167 {
8168   CHECK_SYMBOL (coding_system);
8169   setup_coding_system (Fcheck_coding_system (coding_system),
8170                        &keyboard_coding);
8171   /* Characer composition should be disabled.  */
8172   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8173   return Qnil;
8174 }
8175
8176 DEFUN ("keyboard-coding-system",
8177        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8178        doc: /* Return coding system specified for decoding keyboard input.  */)
8179      ()
8180 {
8181   return CODING_ID_NAME (keyboard_coding.id);
8182 }
8183
8184 \f
8185 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8186        Sfind_operation_coding_system,  1, MANY, 0,
8187        doc: /* Choose a coding system for an operation based on the target name.
8188 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8189 DECODING-SYSTEM is the coding system to use for decoding
8190 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8191 for encoding (in case OPERATION does encoding).
8192
8193 The first argument OPERATION specifies an I/O primitive:
8194   For file I/O, `insert-file-contents' or `write-region'.
8195   For process I/O, `call-process', `call-process-region', or `start-process'.
8196   For network I/O, `open-network-stream'.
8197
8198 The remaining arguments should be the same arguments that were passed
8199 to the primitive.  Depending on which primitive, one of those arguments
8200 is selected as the TARGET.  For example, if OPERATION does file I/O,
8201 whichever argument specifies the file name is TARGET.
8202
8203 TARGET has a meaning which depends on OPERATION:
8204   For file I/O, TARGET is a file name.
8205   For process I/O, TARGET is a process name.
8206   For network I/O, TARGET is a service name or a port number
8207
8208 This function looks up what specified for TARGET in,
8209 `file-coding-system-alist', `process-coding-system-alist',
8210 or `network-coding-system-alist' depending on OPERATION.
8211 They may specify a coding system, a cons of coding systems,
8212 or a function symbol to call.
8213 In the last case, we call the function with one argument,
8214 which is a list of all the arguments given to this function.
8215
8216 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
8217      (nargs, args)
8218      int nargs;
8219      Lisp_Object *args;
8220 {
8221   Lisp_Object operation, target_idx, target, val;
8222   register Lisp_Object chain;
8223
8224   if (nargs < 2)
8225     error ("Too few arguments");
8226   operation = args[0];
8227   if (!SYMBOLP (operation)
8228       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8229     error ("Invalid first arguement");
8230   if (nargs < 1 + XINT (target_idx))
8231     error ("Too few arguments for operation: %s",
8232            SDATA (SYMBOL_NAME (operation)));
8233   target = args[XINT (target_idx) + 1];
8234   if (!(STRINGP (target)
8235         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8236     error ("Invalid %dth argument", XINT (target_idx) + 1);
8237
8238   chain = ((EQ (operation, Qinsert_file_contents)
8239             || EQ (operation, Qwrite_region))
8240            ? Vfile_coding_system_alist
8241            : (EQ (operation, Qopen_network_stream)
8242               ? Vnetwork_coding_system_alist
8243               : Vprocess_coding_system_alist));
8244   if (NILP (chain))
8245     return Qnil;
8246
8247   for (; CONSP (chain); chain = XCDR (chain))
8248     {
8249       Lisp_Object elt;
8250
8251       elt = XCAR (chain);
8252       if (CONSP (elt)
8253           && ((STRINGP (target)
8254                && STRINGP (XCAR (elt))
8255                && fast_string_match (XCAR (elt), target) >= 0)
8256               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8257         {
8258           val = XCDR (elt);
8259           /* Here, if VAL is both a valid coding system and a valid
8260              function symbol, we return VAL as a coding system.  */
8261           if (CONSP (val))
8262             return val;
8263           if (! SYMBOLP (val))
8264             return Qnil;
8265           if (! NILP (Fcoding_system_p (val)))
8266             return Fcons (val, val);
8267           if (! NILP (Ffboundp (val)))
8268             {
8269               val = call1 (val, Flist (nargs, args));
8270               if (CONSP (val))
8271                 return val;
8272               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8273                 return Fcons (val, val);
8274             }
8275           return Qnil;
8276         }
8277     }
8278   return Qnil;
8279 }
8280
8281 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8282        Sset_coding_system_priority, 0, MANY, 0,
8283        doc: /* Assign higher priority to the coding systems given as arguments.
8284 If multiple coding systems belongs to the same category,
8285 all but the first one are ignored.
8286
8287 usage: (set-coding-system-priority ...)  */)
8288      (nargs, args)
8289      int nargs;
8290      Lisp_Object *args;
8291 {
8292   int i, j;
8293   int changed[coding_category_max];
8294   enum coding_category priorities[coding_category_max];
8295
8296   bzero (changed, sizeof changed);
8297
8298   for (i = j = 0; i < nargs; i++)
8299     {
8300       enum coding_category category;
8301       Lisp_Object spec, attrs;
8302
8303       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8304       attrs = AREF (spec, 0);
8305       category = XINT (CODING_ATTR_CATEGORY (attrs));
8306       if (changed[category])
8307         /* Ignore this coding system because a coding system of the
8308            same category already had a higher priority.  */
8309         continue;
8310       changed[category] = 1;
8311       priorities[j++] = category;
8312       if (coding_categories[category].id >= 0
8313           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8314         setup_coding_system (args[i], &coding_categories[category]);
8315       Fset (AREF (Vcoding_category_table, category), args[i]);
8316     }
8317
8318   /* Now we have decided top J priorities.  Reflect the order of the
8319      original priorities to the remaining priorities.  */
8320
8321   for (i = j, j = 0; i < coding_category_max; i++, j++)
8322     {
8323       while (j < coding_category_max
8324              && changed[coding_priorities[j]])
8325         j++;
8326       if (j == coding_category_max)
8327         abort ();
8328       priorities[i] = coding_priorities[j];
8329     }
8330
8331   bcopy (priorities, coding_priorities, sizeof priorities);
8332
8333   /* Update `coding-category-list'.  */
8334   Vcoding_category_list = Qnil;
8335   for (i = coding_category_max - 1; i >= 0; i--)
8336     Vcoding_category_list
8337       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8338                Vcoding_category_list);
8339
8340   return Qnil;
8341 }
8342
8343 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8344        Scoding_system_priority_list, 0, 1, 0,
8345        doc: /* Return a list of coding systems ordered by their priorities.
8346 HIGHESTP non-nil means just return the highest priority one.  */)
8347      (highestp)
8348      Lisp_Object highestp;
8349 {
8350   int i;
8351   Lisp_Object val;
8352
8353   for (i = 0, val = Qnil; i < coding_category_max; i++)
8354     {
8355       enum coding_category category = coding_priorities[i];
8356       int id = coding_categories[category].id;
8357       Lisp_Object attrs;
8358
8359       if (id < 0)
8360         continue;
8361       attrs = CODING_ID_ATTRS (id);
8362       if (! NILP (highestp))
8363         return CODING_ATTR_BASE_NAME (attrs);
8364       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8365     }
8366   return Fnreverse (val);
8367 }
8368
8369 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8370
8371 static Lisp_Object
8372 make_subsidiaries (base)
8373      Lisp_Object base;
8374 {
8375   Lisp_Object subsidiaries;
8376   int base_name_len = SBYTES (SYMBOL_NAME (base));
8377   char *buf = (char *) alloca (base_name_len + 6);
8378   int i;
8379
8380   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8381   subsidiaries = Fmake_vector (make_number (3), Qnil);
8382   for (i = 0; i < 3; i++)
8383     {
8384       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8385       ASET (subsidiaries, i, intern (buf));
8386     }
8387   return subsidiaries;
8388 }
8389
8390
8391 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
8392        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
8393        doc: /* For internal use only.
8394 usage: (define-coding-system-internal ...)  */)
8395      (nargs, args)
8396      int nargs;
8397      Lisp_Object *args;
8398 {
       /* Validate ARGS (indexed by the coding_arg_* constants), build an
          attribute vector for the coding system named
          ARGS[coding_arg_name], and register it in
          Vcoding_system_hash_table, Vcoding_system_list and
          Vcoding_system_alist.  When no eol-type is given, three
          subsidiary coding systems sharing the same attributes are
          registered as well.  Return nil.  When NARGS is too small for
          the coding type, signal `wrong-number-of-arguments'.  */
8399   Lisp_Object name;
8400   Lisp_Object spec_vec;         /* [ ATTRS ALIASES EOL_TYPE ] */
8401   Lisp_Object attrs;            /* Vector of attributes.  */
8402   Lisp_Object eol_type;
8403   Lisp_Object aliases;
8404   Lisp_Object coding_type, charset_list, safe_charsets;
8405   enum coding_category category;
8406   Lisp_Object tail, val;
8407   int max_charset_id = 0;
8408   int i;
8409
8410   if (nargs < coding_arg_max)
8411     goto short_args;
8412
8413   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
8414
8415   name = args[coding_arg_name];
8416   CHECK_SYMBOL (name);
8417   CODING_ATTR_BASE_NAME (attrs) = name;
8418
       /* The mnemonic is either a string or a character.  */
8419   val = args[coding_arg_mnemonic];
8420   if (! STRINGP (val))
8421     CHECK_CHARACTER (val);
8422   CODING_ATTR_MNEMONIC (attrs) = val;
8423
8424   coding_type = args[coding_arg_coding_type];
8425   CHECK_SYMBOL (coding_type);
8426   CODING_ATTR_TYPE (attrs) = coding_type;
8427
       /* The charset list is either a symbol or a list of charsets.  The
          symbols `iso-2022' and `emacs-mule' select a predefined list and
          are valid only for the coding type of the same name.  A list is
          copied and each element is replaced by its charset ID.
          max_charset_id ends up as the largest ID in the list.  */
8428   charset_list = args[coding_arg_charset_list];
8429   if (SYMBOLP (charset_list))
8430     {
8431       if (EQ (charset_list, Qiso_2022))
8432         {
8433           if (! EQ (coding_type, Qiso_2022))
8434             error ("Invalid charset-list");
8435           charset_list = Viso_2022_charset_list;
8436         }
8437       else if (EQ (charset_list, Qemacs_mule))
8438         {
8439           if (! EQ (coding_type, Qemacs_mule))
8440             error ("Invalid charset-list");
8441           charset_list = Vemacs_mule_charset_list;
8442         }
8443       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8444         if (max_charset_id < XFASTINT (XCAR (tail)))
8445           max_charset_id = XFASTINT (XCAR (tail));
8446     }
8447   else
8448     {
8449       charset_list = Fcopy_sequence (charset_list);
8450       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
8451         {
8452           struct charset *charset;
8453
8454           val = Fcar (tail);
8455           CHECK_CHARSET_GET_CHARSET (val, charset);
               /* iso-2022 needs a charset with an ISO final byte and
                  emacs-mule one with an emacs-mule ID; other coding
                  types accept any charset here.  */
8456           if (EQ (coding_type, Qiso_2022)
8457               ? CHARSET_ISO_FINAL (charset) < 0
8458               : EQ (coding_type, Qemacs_mule)
8459               ? CHARSET_EMACS_MULE_ID (charset) < 0
8460               : 0)
8461             error ("Can't handle charset `%s'",
8462                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8463
8464           XSETCAR (tail, make_number (charset->id));
8465           if (max_charset_id < charset->id)
8466             max_charset_id = charset->id;
8467         }
8468     }
8469   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
8470
       /* safe_charsets is a string indexed by charset ID: 255 everywhere
          except 0 at the ID of each charset in the list.  */
8471   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
8472                                 make_number (255));
8473   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8474     SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
8475   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
8476
       /* Taken from the caller here; several coding types below
          override it.  */
8477   CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
8478
       /* A translation table is a char-table, a cons, or a symbol.  */
8479   val = args[coding_arg_decode_translation_table];
8480   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8481     CHECK_SYMBOL (val);
8482   CODING_ATTR_DECODE_TBL (attrs) = val;
8483
8484   val = args[coding_arg_encode_translation_table];
8485   if (! CHAR_TABLE_P (val) && ! CONSP (val))
8486     CHECK_SYMBOL (val);
8487   CODING_ATTR_ENCODE_TBL (attrs) = val;
8488
8489   val = args[coding_arg_post_read_conversion];
8490   CHECK_SYMBOL (val);
8491   CODING_ATTR_POST_READ (attrs) = val;
8492
8493   val = args[coding_arg_pre_write_conversion];
8494   CHECK_SYMBOL (val);
8495   CODING_ATTR_PRE_WRITE (attrs) = val;
8496
       /* The default character falls back to a space.  */
8497   val = args[coding_arg_default_char];
8498   if (NILP (val))
8499     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
8500   else
8501     {
8502       CHECK_CHARACTER (val);
8503       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8504     }
8505
       /* Normalized to nil or t.  */
8506   val = args[coding_arg_for_unibyte];
8507   CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;
8508
8509   val = args[coding_arg_plist];
8510   CHECK_LIST (val);
8511   CODING_ATTR_PLIST (attrs) = val;
8512
       /* From here on the work depends on the coding type; every branch
          decides CATEGORY.  */
8513   if (EQ (coding_type, Qcharset))
8514     {
8515       /* Generate a lisp vector of 256 elements.  Each element is nil,
8516          integer, or a list of charset IDs.
8517
8518          If Nth element is nil, the byte code N is invalid in this
8519          coding system.
8520
8521          If Nth element is a number NUM, N is the first byte of a
8522          charset whose ID is NUM.
8523
8524          If Nth element is a list of charset IDs, N is the first byte
8525          of one of them.  The list is sorted by dimensions of the
8526          charsets.  A charset of smaller dimension comes first. */
8527       val = Fmake_vector (make_number (256), Qnil);
8528
8529       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
8530         {
8531           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8532           int dim = CHARSET_DIMENSION (charset);
               /* code_space holds four entries per dimension; IDX selects
                  the entries of the highest dimension, whose first two
                  values bound the loop below.  */
8533           int idx = (dim - 1) * 4;
8534
8535           if (CHARSET_ASCII_COMPATIBLE_P (charset))
8536             CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8537
8538           for (i = charset->code_space[idx];
8539                i <= charset->code_space[idx + 1]; i++)
8540             {
8541               Lisp_Object tmp, tmp2;
8542               int dim2;
8543
8544               tmp = AREF (val, i);
8545               if (NILP (tmp))
8546                 tmp = XCAR (tail);
8547               else if (NUMBERP (tmp))
8548                 {
                       /* One charset is already here: make a two-element
                          list, smaller dimension first.  */
8549                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8550                   if (dim < dim2)
8551                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8552                   else
8553                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8554                 }
8555               else
8556                 {
                       /* Already a list: find the first charset of larger
                          dimension and insert in front of it, or append
                          when there is none.  */
8557                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8558                     {
8559                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8560                       if (dim < dim2)
8561                         break;
8562                     }
8563                   if (NILP (tmp2))
8564                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8565                   else
8566                     {
                           /* Insert in place: duplicate the cell at TMP2,
                              then overwrite its car.  */
8567                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8568                       XSETCAR (tmp2, XCAR (tail));
8569                     }
8570                 }
8571               ASET (val, i, tmp);
8572             }
8573         }
8574       ASET (attrs, coding_attr_charset_valids, val);
8575       category = coding_category_charset;
8576     }
8577   else if (EQ (coding_type, Qccl))
8578     {
8579       Lisp_Object valids;
8580
8581       if (nargs < coding_arg_ccl_max)
8582         goto short_args;
8583
           /* Decoder and encoder are checked with CHECK_CCL_PROGRAM; a
              vector is stored as a copy.  */
8584       val = args[coding_arg_ccl_decoder];
8585       CHECK_CCL_PROGRAM (val);
8586       if (VECTORP (val))
8587         val = Fcopy_sequence (val);
8588       ASET (attrs, coding_attr_ccl_decoder, val);
8589
8590       val = args[coding_arg_ccl_encoder];
8591       CHECK_CCL_PROGRAM (val);
8592       if (VECTORP (val))
8593         val = Fcopy_sequence (val);
8594       ASET (attrs, coding_attr_ccl_encoder, val);
8595
           /* VALIDS becomes a 256-byte string holding 1 at every byte
              value named by the argument list, whose elements are either
              an integer or a cons (FROM . TO), each within 0..255.  */
8596       val = args[coding_arg_ccl_valids];
8597       valids = Fmake_string (make_number (256), make_number (0));
8598       for (tail = val; !NILP (tail); tail = Fcdr (tail))
8599         {
8600           int from, to;
8601
8602           val = Fcar (tail);
8603           if (INTEGERP (val))
8604             {
8605               from = to = XINT (val);
8606               if (from < 0 || from > 255)
8607                 args_out_of_range_3 (val, make_number (0), make_number (255));
8608             }
8609           else
8610             {
8611               CHECK_CONS (val);
8612               CHECK_NATNUM_CAR (val);
8613               CHECK_NATNUM_CDR (val);
8614               from = XINT (XCAR (val));
8615               if (from > 255)
8616                 args_out_of_range_3 (XCAR (val),
8617                                      make_number (0), make_number (255));
8618               to = XINT (XCDR (val));
8619               if (to < from || to > 255)
8620                 args_out_of_range_3 (XCDR (val),
8621                                      XCAR (val), make_number (255));
8622             }
8623           for (i = from; i <= to; i++)
8624             SSET (valids, i, 1);
8625         }
8626       ASET (attrs, coding_attr_ccl_valids, valids);
8627
8628       category = coding_category_ccl;
8629     }
8630   else if (EQ (coding_type, Qutf_16))
8631     {
8632       Lisp_Object bom, endian;
8633
8634       CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8635
8636       if (nargs < coding_arg_utf16_max)
8637         goto short_args;
8638
           /* BOM is nil, t, or a cons of two coding systems.  */
8639       bom = args[coding_arg_utf16_bom];
8640       if (! NILP (bom) && ! EQ (bom, Qt))
8641         {
8642           CHECK_CONS (bom);
8643           val = XCAR (bom);
8644           CHECK_CODING_SYSTEM (val);
8645           val = XCDR (bom);
8646           CHECK_CODING_SYSTEM (val);
8647         }
8648       ASET (attrs, coding_attr_utf_16_bom, bom);
8649
           /* ENDIAN is `big' or `little'; nil means `big'.  */
8650       endian = args[coding_arg_utf16_endian];
8651       CHECK_SYMBOL (endian);
8652       if (NILP (endian))
8653         endian = Qbig;
8654       else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8655         error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
8656       ASET (attrs, coding_attr_utf_16_endian, endian);
8657
           /* A cons BOM selects the auto category, nil the `nosig'
              category of the endian, t the plain category of the
              endian.  */
8658       category = (CONSP (bom)
8659                   ? coding_category_utf_16_auto
8660                   : NILP (bom)
8661                   ? (EQ (endian, Qbig)
8662                      ? coding_category_utf_16_be_nosig
8663                      : coding_category_utf_16_le_nosig)
8664                   : (EQ (endian, Qbig)
8665                      ? coding_category_utf_16_be
8666                      : coding_category_utf_16_le));
8667     }
8668   else if (EQ (coding_type, Qiso_2022))
8669     {
8670       Lisp_Object initial, reg_usage, request, flags;
           /* NOTE(review): this I shadows the function-level I.  */
8671       int i;
8672
8673       if (nargs < coding_arg_iso2022_max)
8674         goto short_args;
8675
           /* INITIAL is a copy of the argument vector whose first four
              elements are replaced by a charset ID, or by -1 for nil.  */
8676       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8677       CHECK_VECTOR (initial);
8678       for (i = 0; i < 4; i++)
8679         {
8680           val = Faref (initial, make_number (i));
8681           if (! NILP (val))
8682             {
8683               struct charset *charset;
8684
8685               CHECK_CHARSET_GET_CHARSET (val, charset);
8686               ASET (initial, i, make_number (CHARSET_ID (charset)));
8687               if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8688                 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8689             }
8690           else
8691             ASET (initial, i, make_number (-1));
8692         }
8693
           /* REG_USAGE is a cons of two numbers.  */
8694       reg_usage = args[coding_arg_iso2022_reg_usage];
8695       CHECK_CONS (reg_usage);
8696       CHECK_NUMBER_CAR (reg_usage);
8697       CHECK_NUMBER_CDR (reg_usage);
8698
           /* REQUEST is a list of (CHARSET . REG) with REG in 0..3.  Only
              the list spine is copied, so the XSETCAR below modifies the
              cons cells of the caller's list elements.  */
8699       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8700       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8701         {
8702           int id;
8703           Lisp_Object tmp;
8704
8705           val = Fcar (tail);
8706           CHECK_CONS (val);
8707           tmp = XCAR (val);
8708           CHECK_CHARSET_GET_ID (tmp, id);
8709           CHECK_NATNUM_CDR (val);
8710           if (XINT (XCDR (val)) >= 4)
8711             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8712           XSETCAR (val, make_number (id));
8713         }
8714
           /* I keeps the flags as given; the stored FLAGS additionally
              get CODING_ISO_FLAG_FULL_SUPPORT when the charset list
              argument was the symbol `iso-2022'.  */
8715       flags = args[coding_arg_iso2022_flags];
8716       CHECK_NATNUM (flags);
8717       i = XINT (flags);
8718       if (EQ (args[coding_arg_charset_list], Qiso_2022))
8719         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8720
8721       ASET (attrs, coding_attr_iso_initial, initial);
8722       ASET (attrs, coding_attr_iso_usage, reg_usage);
8723       ASET (attrs, coding_attr_iso_request, request);
8724       ASET (attrs, coding_attr_iso_flags, flags);
8725       setup_iso_safe_charsets (attrs);
8726
           /* Choose the ISO category from the flags, from whether the
              full `iso-2022' charset list was requested, and (for 8-bit)
              from the charset in INITIAL[1].  */
8727       if (i & CODING_ISO_FLAG_SEVEN_BITS)
8728         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8729                           | CODING_ISO_FLAG_SINGLE_SHIFT))
8730                     ? coding_category_iso_7_else
8731                     : EQ (args[coding_arg_charset_list], Qiso_2022)
8732                     ? coding_category_iso_7
8733                     : coding_category_iso_7_tight);
8734       else
8735         {
8736           int id = XINT (AREF (initial, 1));
8737
8738           category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8739                        || EQ (args[coding_arg_charset_list], Qiso_2022)
8740                        || id < 0)
8741                       ? coding_category_iso_8_else
8742                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8743                       ? coding_category_iso_8_1
8744                       : coding_category_iso_8_2);
8745         }
           /* Only the iso_8_1 and iso_8_2 categories keep the ASCII
              compatibility decided above.  */
8746       if (category != coding_category_iso_8_1
8747           && category != coding_category_iso_8_2)
8748         CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8749     }
8750   else if (EQ (coding_type, Qemacs_mule))
8751     {
8752       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8753         ASET (attrs, coding_attr_emacs_mule_full, Qt);
8754       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8755       category = coding_category_emacs_mule;
8756     }
8757   else if (EQ (coding_type, Qshift_jis))
8758     {
8759
8760       struct charset *charset;
8761
           /* Three or four charsets are required: the first two of
              dimension one, the third and the optional fourth of
              dimension two.  The local CHARSET_LIST is advanced while
              checking; ATTRS already holds the full list.  */
8762       if (XINT (Flength (charset_list)) != 3
8763           && XINT (Flength (charset_list)) != 4)
8764         error ("There should be three or four charsets");
8765
8766       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8767       if (CHARSET_DIMENSION (charset) != 1)
8768         error ("Dimension of charset %s is not one",
8769                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8770       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8771         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8772
8773       charset_list = XCDR (charset_list);
8774       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8775       if (CHARSET_DIMENSION (charset) != 1)
8776         error ("Dimension of charset %s is not one",
8777                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8778
8779       charset_list = XCDR (charset_list);
8780       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8781       if (CHARSET_DIMENSION (charset) != 2)
8782         error ("Dimension of charset %s is not two",
8783                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8784
8785       charset_list = XCDR (charset_list);
8786       if (! NILP (charset_list))
8787         {
8788           charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8789           if (CHARSET_DIMENSION (charset) != 2)
8790             error ("Dimension of charset %s is not two",
8791                    SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8792         }
8793
           /* The most recently defined shift_jis coding system becomes
              Vsjis_coding_system.  */
8794       category = coding_category_sjis;
8795       Vsjis_coding_system = name;
8796     }
8797   else if (EQ (coding_type, Qbig5))
8798     {
8799       struct charset *charset;
8800
           /* Exactly two charsets are required: one of dimension one
              followed by one of dimension two.  */
8801       if (XINT (Flength (charset_list)) != 2)
8802         error ("There should be just two charsets");
8803
8804       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8805       if (CHARSET_DIMENSION (charset) != 1)
8806         error ("Dimension of charset %s is not one",
8807                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8808       if (CHARSET_ASCII_COMPATIBLE_P (charset))
8809         CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8810
8811       charset_list = XCDR (charset_list);
8812       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8813       if (CHARSET_DIMENSION (charset) != 2)
8814         error ("Dimension of charset %s is not two",
8815                SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
8816
           /* The most recently defined big5 coding system becomes
              Vbig5_coding_system.  */
8817       category = coding_category_big5;
8818       Vbig5_coding_system = name;
8819     }
8820   else if (EQ (coding_type, Qraw_text))
8821     {
8822       category = coding_category_raw_text;
8823       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8824     }
8825   else if (EQ (coding_type, Qutf_8))
8826     {
8827       category = coding_category_utf_8;
8828       CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8829     }
8830   else if (EQ (coding_type, Qundecided))
8831     category = coding_category_undecided;
8832   else
8833     error ("Invalid coding system type: %s",
8834            SDATA (SYMBOL_NAME (coding_type)));
8835
       /* Record the category as a number in ATTRS and push QCcategory
          with the category's entry of Vcoding_category_table onto the
          front of the property list.  */
8836   CODING_ATTR_CATEGORY (attrs) = make_number (category);
8837   CODING_ATTR_PLIST (attrs)
8838     = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8839                                 CODING_ATTR_PLIST (attrs)));
8840
       /* The eol-type argument is nil, `unix', `dos' or `mac'.  */
8841   eol_type = args[coding_arg_eol_type];
8842   if (! NILP (eol_type)
8843       && ! EQ (eol_type, Qunix)
8844       && ! EQ (eol_type, Qdos)
8845       && ! EQ (eol_type, Qmac))
8846     error ("Invalid eol-type");
8847
8848   aliases = Fcons (name, Qnil);
8849
       /* With a nil eol-type, EOL_TYPE becomes the vector of the three
          subsidiary names made by make_subsidiaries.  Each subsidiary is
          registered with its own spec vector that shares ATTRS and has a
          fixed eol-type.  */
8850   if (NILP (eol_type))
8851     {
8852       eol_type = make_subsidiaries (name);
8853       for (i = 0; i < 3; i++)
8854         {
8855           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8856
8857           this_name = AREF (eol_type, i);
8858           this_aliases = Fcons (this_name, Qnil);
8859           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8860           this_spec = Fmake_vector (make_number (3), attrs);
8861           ASET (this_spec, 1, this_aliases);
8862           ASET (this_spec, 2, this_eol_type);
8863           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8864           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8865           Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8866                                         Vcoding_system_alist);
8867         }
8868     }
8869
       /* Register the coding system itself: [ ATTRS ALIASES EOL_TYPE ].  */
8870   spec_vec = Fmake_vector (make_number (3), attrs);
8871   ASET (spec_vec, 1, aliases);
8872   ASET (spec_vec, 2, eol_type);
8873
8874   Fputhash (name, spec_vec, Vcoding_system_hash_table);
8875   Vcoding_system_list = Fcons (name, Vcoding_system_list);
8876   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8877                                 Vcoding_system_alist);
8878
       /* Set up the category's coding system from NAME when the category
          has none yet (negative ID) or is already represented by NAME.  */
8879   {
8880     int id = coding_categories[category].id;
8881
8882     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8883       setup_coding_system (name, &coding_categories[category]);
8884   }
8885
8886   return Qnil;
8887
       /* Reached when NARGS is smaller than the coding type requires.  */
8888  short_args:
8889   return Fsignal (Qwrong_number_of_arguments,
8890                   Fcons (intern ("define-coding-system-internal"),
8891                          make_number (nargs)));
8892 }
8893
8894
8895 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8896        3, 3, 0,
8897        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
8898   (coding_system, prop, val)
8899      Lisp_Object coding_system, prop, val;
8900 {
8901   Lisp_Object spec, attrs;
8902
8903   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8904   attrs = AREF (spec, 0);
8905   if (EQ (prop, QCmnemonic))
8906     {
8907       if (! STRINGP (val))
8908         CHECK_CHARACTER (val);
8909       CODING_ATTR_MNEMONIC (attrs) = val;
8910     }
8911   else if (EQ (prop, QCdefalut_char))
8912     {
8913       if (NILP (val))
8914         val = make_number (' ');
8915       else
8916         CHECK_CHARACTER (val);
8917       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8918     }
8919   else if (EQ (prop, QCdecode_translation_table))
8920     {
8921       if (! CHAR_TABLE_P (val) && ! CONSP (val))
8922         CHECK_SYMBOL (val);
8923       CODING_ATTR_DECODE_TBL (attrs) = val;
8924     }
8925   else if (EQ (prop, QCencode_translation_table))
8926     {
8927       if (! CHAR_TABLE_P (val) && ! CONSP (val))
8928         CHECK_SYMBOL (val);
8929       CODING_ATTR_ENCODE_TBL (attrs) = val;
8930     }
8931   else if (EQ (prop, QCpost_read_conversion))
8932     {
8933       CHECK_SYMBOL (val);
8934       CODING_ATTR_POST_READ (attrs) = val;
8935     }
8936   else if (EQ (prop, QCpre_write_conversion))
8937     {
8938       CHECK_SYMBOL (val);
8939       CODING_ATTR_PRE_WRITE (attrs) = val;
8940     }
8941
8942   CODING_ATTR_PLIST (attrs)
8943     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
8944   return val;
8945 }
8946
8947
8948 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8949        Sdefine_coding_system_alias, 2, 2, 0,
8950        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
8951      (alias, coding_system)
8952      Lisp_Object alias, coding_system;
8953 {
8954   Lisp_Object spec, aliases, eol_type;
8955
8956   CHECK_SYMBOL (alias);
8957   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8958   aliases = AREF (spec, 1);
8959   /* ALISES should be a list of length more than zero, and the first
8960      element is a base coding system.  Append ALIAS at the tail of the
8961      list.  */
8962   while (!NILP (XCDR (aliases)))
8963     aliases = XCDR (aliases);
8964   XSETCDR (aliases, Fcons (alias, Qnil));
8965
8966   eol_type = AREF (spec, 2);
8967   if (VECTORP (eol_type))
8968     {
8969       Lisp_Object subsidiaries;
8970       int i;
8971
8972       subsidiaries = make_subsidiaries (alias);
8973       for (i = 0; i < 3; i++)
8974         Fdefine_coding_system_alias (AREF (subsidiaries, i),
8975                                      AREF (eol_type, i));
8976     }
8977
8978   Fputhash (alias, spec, Vcoding_system_hash_table);
8979   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
8980   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8981                                 Vcoding_system_alist);
8982
8983   return Qnil;
8984 }
8985
8986 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8987        1, 1, 0,
8988        doc: /* Return the base of CODING-SYSTEM.
8989 Any alias or subsidiary coding system is not a base coding system.  */)
8990   (coding_system)
8991      Lisp_Object coding_system;
8992 {
8993   Lisp_Object spec, attrs;
8994
8995   if (NILP (coding_system))
8996     return (Qno_conversion);
8997   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8998   attrs = AREF (spec, 0);
8999   return CODING_ATTR_BASE_NAME (attrs);
9000 }
9001
9002 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
9003        1, 1, 0,
9004        doc: "Return the property list of CODING-SYSTEM.")
9005      (coding_system)
9006      Lisp_Object coding_system;
9007 {
9008   Lisp_Object spec, attrs;
9009
9010   if (NILP (coding_system))
9011     coding_system = Qno_conversion;
9012   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9013   attrs = AREF (spec, 0);
9014   return CODING_ATTR_PLIST (attrs);
9015 }
9016
9017
9018 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
9019        1, 1, 0,
9020        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
9021      (coding_system)
9022      Lisp_Object coding_system;
9023 {
9024   Lisp_Object spec;
9025
9026   if (NILP (coding_system))
9027     coding_system = Qno_conversion;
9028   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9029   return AREF (spec, 1);
9030 }
9031
9032 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9033        Scoding_system_eol_type, 1, 1, 0,
9034        doc: /* Return eol-type of CODING-SYSTEM.
9035 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9036
9037 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9038 and CR respectively.
9039
9040 A vector value indicates that a format of end-of-line should be
9041 detected automatically.  Nth element of the vector is the subsidiary
9042 coding system whose eol-type is N.  */)
9043      (coding_system)
9044      Lisp_Object coding_system;
9045 {
9046   Lisp_Object spec, eol_type;
9047   int n;
9048
9049   if (NILP (coding_system))
9050     coding_system = Qno_conversion;
9051   if (! CODING_SYSTEM_P (coding_system))
9052     return Qnil;
9053   spec = CODING_SYSTEM_SPEC (coding_system);
9054   eol_type = AREF (spec, 2);
9055   if (VECTORP (eol_type))
9056     return Fcopy_sequence (eol_type);
9057   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9058   return make_number (n);
9059 }
9060
9061 #endif /* emacs */
9062
9063 \f
/*** 12. Postamble ***/
9065
9066 void
9067 init_coding_once ()
9068 {
9069   int i;
9070
9071   for (i = 0; i < coding_category_max; i++)
9072     {
9073       coding_categories[i].id = -1;
9074       coding_priorities[i] = i;
9075     }
9076
9077   /* ISO2022 specific initialize routine.  */
9078   for (i = 0; i < 0x20; i++)
9079     iso_code_class[i] = ISO_control_0;
9080   for (i = 0x21; i < 0x7F; i++)
9081     iso_code_class[i] = ISO_graphic_plane_0;
9082   for (i = 0x80; i < 0xA0; i++)
9083     iso_code_class[i] = ISO_control_1;
9084   for (i = 0xA1; i < 0xFF; i++)
9085     iso_code_class[i] = ISO_graphic_plane_1;
9086   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9087   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9088   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9089   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9090   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9091   iso_code_class[ISO_CODE_ESC] = ISO_escape;
9092   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9093   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9094   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9095
9096   for (i = 0; i < 256; i++)
9097     {
9098       emacs_mule_bytes[i] = 1;
9099     }
9100   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9101   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9102   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9103   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9104 }
9105
9106 #ifdef emacs
9107
9108 void
9109 syms_of_coding ()
9110 {
9111   staticpro (&Vcoding_system_hash_table);
9112   {
9113     Lisp_Object args[2];
9114     args[0] = QCtest;
9115     args[1] = Qeq;
9116     Vcoding_system_hash_table = Fmake_hash_table (2, args);
9117   }
9118
9119   staticpro (&Vsjis_coding_system);
9120   Vsjis_coding_system = Qnil;
9121
9122   staticpro (&Vbig5_coding_system);
9123   Vbig5_coding_system = Qnil;
9124
9125   staticpro (&Vcode_conversion_reused_workbuf);
9126   Vcode_conversion_reused_workbuf = Qnil;
9127
9128   staticpro (&Vcode_conversion_workbuf_name);
9129   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9130
9131   reused_workbuf_in_use = 0;
9132
9133   DEFSYM (Qcharset, "charset");
9134   DEFSYM (Qtarget_idx, "target-idx");
9135   DEFSYM (Qcoding_system_history, "coding-system-history");
9136   Fset (Qcoding_system_history, Qnil);
9137
9138   /* Target FILENAME is the first argument.  */
9139   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9140   /* Target FILENAME is the third argument.  */
9141   Fput (Qwrite_region, Qtarget_idx, make_number (2));
9142
9143   DEFSYM (Qcall_process, "call-process");
9144   /* Target PROGRAM is the first argument.  */
9145   Fput (Qcall_process, Qtarget_idx, make_number (0));
9146
9147   DEFSYM (Qcall_process_region, "call-process-region");
9148   /* Target PROGRAM is the third argument.  */
9149   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9150
9151   DEFSYM (Qstart_process, "start-process");
9152   /* Target PROGRAM is the third argument.  */
9153   Fput (Qstart_process, Qtarget_idx, make_number (2));
9154
9155   DEFSYM (Qopen_network_stream, "open-network-stream");
9156   /* Target SERVICE is the fourth argument.  */
9157   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9158
9159   DEFSYM (Qcoding_system, "coding-system");
9160   DEFSYM (Qcoding_aliases, "coding-aliases");
9161
9162   DEFSYM (Qeol_type, "eol-type");
9163   DEFSYM (Qunix, "unix");
9164   DEFSYM (Qdos, "dos");
9165
9166   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9167   DEFSYM (Qpost_read_conversion, "post-read-conversion");
9168   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9169   DEFSYM (Qdefault_char, "default-char");
9170   DEFSYM (Qundecided, "undecided");
9171   DEFSYM (Qno_conversion, "no-conversion");
9172   DEFSYM (Qraw_text, "raw-text");
9173
9174   DEFSYM (Qiso_2022, "iso-2022");
9175
9176   DEFSYM (Qutf_8, "utf-8");
9177   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9178
9179   DEFSYM (Qutf_16, "utf-16");
9180   DEFSYM (Qbig, "big");
9181   DEFSYM (Qlittle, "little");
9182
9183   DEFSYM (Qshift_jis, "shift-jis");
9184   DEFSYM (Qbig5, "big5");
9185
9186   DEFSYM (Qcoding_system_p, "coding-system-p");
9187
9188   DEFSYM (Qcoding_system_error, "coding-system-error");
9189   Fput (Qcoding_system_error, Qerror_conditions,
9190         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9191   Fput (Qcoding_system_error, Qerror_message,
9192         build_string ("Invalid coding system"));
9193
9194   /* Intern this now in case it isn't already done.
9195      Setting this variable twice is harmless.
9196      But don't staticpro it here--that is done in alloc.c.  */
9197   Qchar_table_extra_slots = intern ("char-table-extra-slots");
9198
9199   DEFSYM (Qtranslation_table, "translation-table");
9200   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9201   DEFSYM (Qtranslation_table_id, "translation-table-id");
9202   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9203   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9204
9205   DEFSYM (Qvalid_codes, "valid-codes");
9206
9207   DEFSYM (Qemacs_mule, "emacs-mule");
9208
9209   DEFSYM (QCcategory, ":category");
9210   DEFSYM (QCmnemonic, ":mnemonic");
9211   DEFSYM (QCdefalut_char, ":default-char");
9212   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9213   DEFSYM (QCencode_translation_table, ":encode-translation-table");
9214   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9215   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9216
9217   Vcoding_category_table
9218     = Fmake_vector (make_number (coding_category_max), Qnil);
9219   staticpro (&Vcoding_category_table);
9220   /* Followings are target of code detection.  */
9221   ASET (Vcoding_category_table, coding_category_iso_7,
9222         intern ("coding-category-iso-7"));
9223   ASET (Vcoding_category_table, coding_category_iso_7_tight,
9224         intern ("coding-category-iso-7-tight"));
9225   ASET (Vcoding_category_table, coding_category_iso_8_1,
9226         intern ("coding-category-iso-8-1"));
9227   ASET (Vcoding_category_table, coding_category_iso_8_2,
9228         intern ("coding-category-iso-8-2"));
9229   ASET (Vcoding_category_table, coding_category_iso_7_else,
9230         intern ("coding-category-iso-7-else"));
9231   ASET (Vcoding_category_table, coding_category_iso_8_else,
9232         intern ("coding-category-iso-8-else"));
9233   ASET (Vcoding_category_table, coding_category_utf_8,
9234         intern ("coding-category-utf-8"));
9235   ASET (Vcoding_category_table, coding_category_utf_16_be,
9236         intern ("coding-category-utf-16-be"));
9237   ASET (Vcoding_category_table, coding_category_utf_16_auto,
9238         intern ("coding-category-utf-16-auto"));
9239   ASET (Vcoding_category_table, coding_category_utf_16_le,
9240         intern ("coding-category-utf-16-le"));
9241   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9242         intern ("coding-category-utf-16-be-nosig"));
9243   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9244         intern ("coding-category-utf-16-le-nosig"));
9245   ASET (Vcoding_category_table, coding_category_charset,
9246         intern ("coding-category-charset"));
9247   ASET (Vcoding_category_table, coding_category_sjis,
9248         intern ("coding-category-sjis"));
9249   ASET (Vcoding_category_table, coding_category_big5,
9250         intern ("coding-category-big5"));
9251   ASET (Vcoding_category_table, coding_category_ccl,
9252         intern ("coding-category-ccl"));
9253   ASET (Vcoding_category_table, coding_category_emacs_mule,
9254         intern ("coding-category-emacs-mule"));
9255   /* Followings are NOT target of code detection.  */
9256   ASET (Vcoding_category_table, coding_category_raw_text,
9257         intern ("coding-category-raw-text"));
9258   ASET (Vcoding_category_table, coding_category_undecided,
9259         intern ("coding-category-undecided"));
9260
9261   DEFSYM (Qinsufficient_source, "insufficient-source");
9262   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9263   DEFSYM (Qinvalid_source, "invalid-source");
9264   DEFSYM (Qinterrupted, "interrupted");
9265   DEFSYM (Qinsufficient_memory, "insufficient-memory");
9266
9267   defsubr (&Scoding_system_p);
9268   defsubr (&Sread_coding_system);
9269   defsubr (&Sread_non_nil_coding_system);
9270   defsubr (&Scheck_coding_system);
9271   defsubr (&Sdetect_coding_region);
9272   defsubr (&Sdetect_coding_string);
9273   defsubr (&Sfind_coding_systems_region_internal);
9274   defsubr (&Sunencodable_char_position);
9275   defsubr (&Scheck_coding_systems_region);
9276   defsubr (&Sdecode_coding_region);
9277   defsubr (&Sencode_coding_region);
9278   defsubr (&Sdecode_coding_string);
9279   defsubr (&Sencode_coding_string);
9280   defsubr (&Sdecode_sjis_char);
9281   defsubr (&Sencode_sjis_char);
9282   defsubr (&Sdecode_big5_char);
9283   defsubr (&Sencode_big5_char);
9284   defsubr (&Sset_terminal_coding_system_internal);
9285   defsubr (&Sset_safe_terminal_coding_system_internal);
9286   defsubr (&Sterminal_coding_system);
9287   defsubr (&Sset_keyboard_coding_system_internal);
9288   defsubr (&Skeyboard_coding_system);
9289   defsubr (&Sfind_operation_coding_system);
9290   defsubr (&Sset_coding_system_priority);
9291   defsubr (&Sdefine_coding_system_internal);
9292   defsubr (&Sdefine_coding_system_alias);
9293   defsubr (&Scoding_system_put);
9294   defsubr (&Scoding_system_base);
9295   defsubr (&Scoding_system_plist);
9296   defsubr (&Scoding_system_aliases);
9297   defsubr (&Scoding_system_eol_type);
9298   defsubr (&Scoding_system_priority_list);
9299
9300   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9301                doc: /* List of coding systems.
9302
9303 Do not alter the value of this variable manually.  This variable should be
9304 updated by the functions `define-coding-system' and
9305 `define-coding-system-alias'.  */);
9306   Vcoding_system_list = Qnil;
9307
9308   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9309                doc: /* Alist of coding system names.
9310 Each element is one element list of coding system name.
9311 This variable is given to `completing-read' as TABLE argument.
9312
9313 Do not alter the value of this variable manually.  This variable should be
9314 updated by the functions `make-coding-system' and
9315 `define-coding-system-alias'.  */);
9316   Vcoding_system_alist = Qnil;
9317
9318   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9319                doc: /* List of coding-categories (symbols) ordered by priority.
9320
9321 On detecting a coding system, Emacs tries code detection algorithms
9322 associated with each coding-category one by one in this order.  When
9323 one algorithm agrees with a byte sequence of source text, the coding
9324 system bound to the corresponding coding-category is selected.  */);
9325   {
9326     int i;
9327
9328     Vcoding_category_list = Qnil;
9329     for (i = coding_category_max - 1; i >= 0; i--)
9330       Vcoding_category_list
9331         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9332                  Vcoding_category_list);
9333   }
9334
9335   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9336                doc: /* Specify the coding system for read operations.
9337 It is useful to bind this variable with `let', but do not set it globally.
9338 If the value is a coding system, it is used for decoding on read operation.
9339 If not, an appropriate element is used from one of the coding system alists:
9340 There are three such tables, `file-coding-system-alist',
9341 `process-coding-system-alist', and `network-coding-system-alist'.  */);
9342   Vcoding_system_for_read = Qnil;
9343
9344   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9345                doc: /* Specify the coding system for write operations.
9346 Programs bind this variable with `let', but you should not set it globally.
9347 If the value is a coding system, it is used for encoding of output,
9348 when writing it to a file and when sending it to a file or subprocess.
9349
9350 If this does not specify a coding system, an appropriate element
9351 is used from one of the coding system alists:
9352 There are three such tables, `file-coding-system-alist',
9353 `process-coding-system-alist', and `network-coding-system-alist'.
9354 For output to files, if the above procedure does not specify a coding system,
9355 the value of `buffer-file-coding-system' is used.  */);
9356   Vcoding_system_for_write = Qnil;
9357
9358   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9359                doc: /*
9360 Coding system used in the latest file or process I/O.  */);
9361   Vlast_coding_system_used = Qnil;
9362
9363   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9364                doc: /*
9365 Error status of the last code conversion.
9366
9367 When an error was detected in the last code conversion, this variable
9368 is set to one of the following symbols.
9369   `insufficient-source'
9370   `inconsistent-eol'
9371   `invalid-source'
9372   `interrupted'
9373   `insufficient-memory'
9374 When no error was detected, the value doesn't change.  So, to check
9375 the error status of a code conversion by this variable, you must
9376 explicitly set this variable to nil before performing code
9377 conversion.  */);
9378   Vlast_code_conversion_error = Qnil;
9379
9380   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9381                doc: /*
9382 *Non-nil means always inhibit code conversion of end-of-line format.
9383 See info node `Coding Systems' and info node `Text and Binary' concerning
9384 such conversion.  */);
9385   inhibit_eol_conversion = 0;
9386
9387   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9388                doc: /*
9389 Non-nil means process buffer inherits coding system of process output.
9390 Bind it to t if the process output is to be treated as if it were a file
9391 read from some filesystem.  */);
9392   inherit_process_coding_system = 0;
9393
9394   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9395                doc: /*
9396 Alist to decide a coding system to use for a file I/O operation.
9397 The format is ((PATTERN . VAL) ...),
9398 where PATTERN is a regular expression matching a file name,
9399 VAL is a coding system, a cons of coding systems, or a function symbol.
9400 If VAL is a coding system, it is used for both decoding and encoding
9401 the file contents.
9402 If VAL is a cons of coding systems, the car part is used for decoding,
9403 and the cdr part is used for encoding.
9404 If VAL is a function symbol, the function must return a coding system
9405 or a cons of coding systems which are used as above.  The function gets
9406 the arguments with which `find-operation-coding-systems' was called.
9407
9408 See also the function `find-operation-coding-system'
9409 and the variable `auto-coding-alist'.  */);
9410   Vfile_coding_system_alist = Qnil;
9411
9412   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9413                doc: /*
9414 Alist to decide a coding system to use for a process I/O operation.
9415 The format is ((PATTERN . VAL) ...),
9416 where PATTERN is a regular expression matching a program name,
9417 VAL is a coding system, a cons of coding systems, or a function symbol.
9418 If VAL is a coding system, it is used for both decoding what received
9419 from the program and encoding what sent to the program.
9420 If VAL is a cons of coding systems, the car part is used for decoding,
9421 and the cdr part is used for encoding.
9422 If VAL is a function symbol, the function must return a coding system
9423 or a cons of coding systems which are used as above.
9424
9425 See also the function `find-operation-coding-system'.  */);
9426   Vprocess_coding_system_alist = Qnil;
9427
9428   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9429                doc: /*
9430 Alist to decide a coding system to use for a network I/O operation.
9431 The format is ((PATTERN . VAL) ...),
9432 where PATTERN is a regular expression matching a network service name
9433 or is a port number to connect to,
9434 VAL is a coding system, a cons of coding systems, or a function symbol.
9435 If VAL is a coding system, it is used for both decoding what received
9436 from the network stream and encoding what sent to the network stream.
9437 If VAL is a cons of coding systems, the car part is used for decoding,
9438 and the cdr part is used for encoding.
9439 If VAL is a function symbol, the function must return a coding system
9440 or a cons of coding systems which are used as above.
9441
9442 See also the function `find-operation-coding-system'.  */);
9443   Vnetwork_coding_system_alist = Qnil;
9444
9445   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9446                doc: /* Coding system to use with system messages.
9447 Also used for decoding keyboard input on X Window system.  */);
9448   Vlocale_coding_system = Qnil;
9449
9450   /* The eol mnemonics are reset in startup.el system-dependently.  */
9451   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9452                doc: /*
9453 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
9454   eol_mnemonic_unix = build_string (":");
9455
9456   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9457                doc: /*
9458 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
9459   eol_mnemonic_dos = build_string ("\\");
9460
9461   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9462                doc: /*
9463 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
9464   eol_mnemonic_mac = build_string ("/");
9465
9466   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9467                doc: /*
9468 *String displayed in mode line when end-of-line format is not yet determined.  */);
9469   eol_mnemonic_undecided = build_string (":");
9470
9471   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9472                doc: /*
9473 *Non-nil enables character translation while encoding and decoding.  */);
9474   Venable_character_translation = Qt;
9475
9476   DEFVAR_LISP ("standard-translation-table-for-decode",
9477                &Vstandard_translation_table_for_decode,
9478                doc: /* Table for translating characters while decoding.  */);
9479   Vstandard_translation_table_for_decode = Qnil;
9480
9481   DEFVAR_LISP ("standard-translation-table-for-encode",
9482                &Vstandard_translation_table_for_encode,
9483                doc: /* Table for translating characters while encoding.  */);
9484   Vstandard_translation_table_for_encode = Qnil;
9485
9486   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9487                doc: /* Alist of charsets vs revision numbers.
9488 While encoding, if a charset (car part of an element) is found,
9489 designate it with the escape sequence identifying revision (cdr part
9490 of the element).  */);
9491   Vcharset_revision_table = Qnil;
9492
9493   DEFVAR_LISP ("default-process-coding-system",
9494                &Vdefault_process_coding_system,
9495                doc: /* Cons of coding systems used for process I/O by default.
9496 The car part is used for decoding a process output,
9497 the cdr part is used for encoding a text to be sent to a process.  */);
9498   Vdefault_process_coding_system = Qnil;
9499
9500   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9501                doc: /*
9502 Table of extra Latin codes in the range 128..159 (inclusive).
9503 This is a vector of length 256.
9504 If Nth element is non-nil, the existence of code N in a file
9505 \(or output of subprocess) doesn't prevent it to be detected as
9506 a coding system of ISO 2022 variant which has a flag
9507 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9508 or reading output of a subprocess.
9509 Only 128th through 159th elements has a meaning.  */);
9510   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9511
9512   DEFVAR_LISP ("select-safe-coding-system-function",
9513                &Vselect_safe_coding_system_function,
9514                doc: /*
9515 Function to call to select safe coding system for encoding a text.
9516
9517 If set, this function is called to force a user to select a proper
9518 coding system which can encode the text in the case that a default
9519 coding system used in each operation can't encode the text.
9520
9521 The default value is `select-safe-coding-system' (which see).  */);
9522   Vselect_safe_coding_system_function = Qnil;
9523
9524   DEFVAR_BOOL ("coding-system-require-warning",
9525                &coding_system_require_warning,
9526                doc: /* Internal use only.
9527 If non-nil, on writing a file, `select-safe-coding-system-function' is
9528 called even if `coding-system-for-write' is non-nil.  The command
9529 `universal-coding-system-argument' binds this variable to t temporarily.  */);
9530   coding_system_require_warning = 0;
9531
9532
9533   DEFVAR_BOOL ("inhibit-iso-escape-detection",
9534                &inhibit_iso_escape_detection,
9535                doc: /*
9536 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9537
9538 By default, on reading a file, Emacs tries to detect how the text is
9539 encoded.  This code detection is sensitive to escape sequences.  If
9540 the sequence is valid as ISO2022, the code is determined as one of
9541 the ISO2022 encodings, and the file is decoded by the corresponding
9542 coding system (e.g. `iso-2022-7bit').
9543
9544 However, there may be a case that you want to read escape sequences in
9545 a file as is.  In such a case, you can set this variable to non-nil.
9546 Then, as the code detection ignores any escape sequences, no file is
9547 detected as encoded in some ISO2022 encoding.  The result is that all
9548 escape sequences become visible in a buffer.
9549
9550 The default value is nil, and it is strongly recommended not to change
9551 it.  That is because many Emacs Lisp source files that contain
9552 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9553 in Emacs's distribution, and they won't be decoded correctly on
9554 reading if you suppress escape sequence detection.
9555
9556 The other way to read escape sequences in a file without decoding is
9557 to explicitly specify some coding system that doesn't use ISO2022's
9558 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
9559   inhibit_iso_escape_detection = 0;
9560
9561   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9562                doc: /* Char table for translating self-inserting characters.
9563 This is applied to the result of input methods, not their input.  See also
9564 `keyboard-translate-table'.  */);
9565     Vtranslation_table_for_input = Qnil;
9566
9567   {
9568     Lisp_Object args[coding_arg_max];
9569     Lisp_Object plist[16];
9570     int i;
9571
9572     for (i = 0; i < coding_arg_max; i++)
9573       args[i] = Qnil;
9574
9575     plist[0] = intern (":name");
9576     plist[1] = args[coding_arg_name] = Qno_conversion;
9577     plist[2] = intern (":mnemonic");
9578     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9579     plist[4] = intern (":coding-type");
9580     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9581     plist[6] = intern (":ascii-compatible-p");
9582     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9583     plist[8] = intern (":default-char");
9584     plist[9] = args[coding_arg_default_char] = make_number (0);
9585     plist[10] = intern (":for-unibyte");
9586     plist[11] = args[coding_arg_for_unibyte] = Qt;
9587     plist[12] = intern (":docstring");
9588     plist[13] = build_string ("Do no conversion.\n\
9589 \n\
9590 When you visit a file with this coding, the file is read into a\n\
9591 unibyte buffer as is, thus each byte of a file is treated as a\n\
9592 character.");
9593     plist[14] = intern (":eol-type");
9594     plist[15] = args[coding_arg_eol_type] = Qunix;
9595     args[coding_arg_plist] = Flist (16, plist);
9596     Fdefine_coding_system_internal (coding_arg_max, args);
9597   }
9598
9599   setup_coding_system (Qno_conversion, &keyboard_coding);
9600   setup_coding_system (Qno_conversion, &terminal_coding);
9601   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9602
9603   {
9604     int i;
9605
9606     for (i = 0; i < coding_category_max; i++)
9607       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9608   }
9609 }
9610
9611 char *
9612 emacs_strerror (error_number)
9613      int error_number;
9614 {
9615   char *str;
9616
9617   synchronize_system_messages_locale ();
9618   str = strerror (error_number);
9619
9620   if (! NILP (Vlocale_coding_system))
9621     {
9622       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9623                                                       Vlocale_coding_system,
9624                                                       0);
9625       str = (char *) SDATA (dec);
9626     }
9627
9628   return str;
9629 }
9630
9631 #endif /* emacs */
9632
9633 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9634    (do not change this comment) */