src/coding.c

   1 /* Coding system handler (conversion, detection, etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002 Free Software Foundation, Inc.
   5    Copyright (C) 2003
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
   58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  At
   59   the C level, a coding system is represented by a vector of
   60   attributes stored in the hash table Vcoding_system_hash_table.  The
   61   conversion from a coding system symbol to its attributes vector is
   62   done by looking up Vcoding_system_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used for Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
  134   Before using a coding system for code conversion (i.e. decoding and
  135   encoding), we set up a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX, and update the members of
 148   DETECT_INFO.
 149
 150   Return 1 if the byte sequence conforms to XXX, otherwise return 0.
 151
 152   Below is the template of these functions.  */
 153
 154 #if 0
 155 static int
 156 detect_coding_XXX (coding, detect_info)
 157      struct coding_system *coding;
 158      struct coding_detection_info *detect_info;
 159 {
 160   unsigned char *src = coding->source;
 161   unsigned char *src_end = coding->source + coding->src_bytes;
 162   int multibytep = coding->src_multibyte;
 163   int consumed_chars = 0;
 164   int found = 0;
 165   ...;
 166
 167   while (1)
 168     {
 169       /* Get one byte from the source.  If the souce is exausted, jump
 170          to no_more_source:.  */
 171       ONE_MORE_BYTE (c);
 172
 173       if (! __C_conforms_to_XXX___ (c))
 174         break;
 175       if (! __C_strongly_suggests_XXX__ (c))
 176         found = CATEGORY_MASK_XXX;
 177     }
 178   /* The byte sequence is invalid for XXX.  */
 179   detect_info->rejected |= CATEGORY_MASK_XXX;
 180   return 0;
 181
 182  no_more_source:
 183   /* The source exausted successfully.  */
 184   detect_info->found |= found;
 185   return 1;
 186 }
 187 #endif
 188
 189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 190
 191   These functions decode a byte sequence specified as a source by
 192   CODING.  The resulting multibyte text goes to a place pointed to by
 193   CODING->charbuf, the length of which should not exceed
  194   CODING->charbuf_size.
 195
 196   These functions set the information of original and decoded texts in
 197   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 198   They also set CODING->result to one of CODING_RESULT_XXX indicating
 199   how the decoding is finished.
 200
 201   Below is the template of these functions.  */
 202
 203 #if 0
 204 static void
 205 decode_coding_XXXX (coding)
 206      struct coding_system *coding;
 207 {
 208   unsigned char *src = coding->source + coding->consumed;
 209   unsigned char *src_end = coding->source + coding->src_bytes;
 210   /* SRC_BASE remembers the start position in source in each loop.
 211      The loop will be exited when there's not enough source code, or
 212      when there's no room in CHARBUF for a decoded character.  */
 213   unsigned char *src_base;
 214   /* A buffer to produce decoded characters.  */
 215   int *charbuf = coding->charbuf + coding->charbuf_used;
 216   int *charbuf_end = coding->charbuf + coding->charbuf_size;
 217   int multibytep = coding->src_multibyte;
 218
 219   while (1)
 220     {
 221       src_base = src;
 222       if (charbuf < charbuf_end)
 223         /* No more room to produce a decoded character.  */
 224         break;
 225       ONE_MORE_BYTE (c);
 226       /* Decode it. */
 227     }
 228
 229  no_more_source:
 230   if (src_base < src_end
 231       && coding->mode & CODING_MODE_LAST_BLOCK)
 232     /* If the source ends by partial bytes to construct a character,
 233        treat them as eight-bit raw data.  */
 234     while (src_base < src_end && charbuf < charbuf_end)
 235       *charbuf++ = *src_base++;
 236   /* Remember how many bytes and characters we consumed.  If the
 237      source is multibyte, the bytes and chars are not identical.  */
 238   coding->consumed = coding->consumed_char = src_base - coding->source;
 239   /* Remember how many characters we produced.  */
 240   coding->charbuf_used = charbuf - coding->charbuf;
 241 }
 242 #endif
 243
 244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 245
 246   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 247   internal multibyte format by CODING.  The resulting byte sequence
 248   goes to a place pointed to by DESTINATION, the length of which
 249   should not exceed DST_BYTES.
 250
 251   These functions set the information of original and encoded texts in
 252   the members produced, produced_char, consumed, and consumed_char of
 253   the structure *CODING.  They also set the member result to one of
 254   CODING_RESULT_XXX indicating how the encoding finished.
 255
  256   DST_BYTES zero means that source area and destination area are
  257   overlapped, which means that we can produce an encoded text until it
  258   reaches the head of not-yet-encoded source text.
 259
 260   Below is a template of these functions.  */
 261 #if 0
 262 static void
 263 encode_coding_XXX (coding)
 264      struct coding_system *coding;
 265 {
 266   int multibytep = coding->dst_multibyte;
 267   int *charbuf = coding->charbuf;
 268   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 269   unsigned char *dst = coding->destination + coding->produced;
 270   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 271   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 272   int produced_chars = 0;
 273
 274   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 275     {
 276       int c = *charbuf;
 277       /* Encode C into DST, and increment DST.  */
 278     }
 279  label_no_more_destination:
 280   /* How many chars and bytes we produced.  */
 281   coding->produced_char += produced_chars;
 282   coding->produced = dst - coding->destination;
 283 }
 284 #endif
 285
 286 \f
 287 /*** 1. Preamble ***/
 288
 289 #include <config.h>
 290 #include <stdio.h>
 291
 292 #include "lisp.h"
 293 #include "buffer.h"
 294 #include "character.h"
 295 #include "charset.h"
 296 #include "ccl.h"
 297 #include "composite.h"
 298 #include "coding.h"
 299 #include "window.h"
 300
  301 Lisp_Object Vcoding_system_hash_table;
      /* NOTE(review): presumably the hash table that the general comments
         describe as mapping a coding system symbol to its attribute
         vector -- confirm against the code that fills it.  */
  302
      /* Lisp objects whose names follow the Q... (symbol) and QC...
         (presumably keyword symbol) conventions.  They are only declared
         here; they are presumably initialized elsewhere in this file --
         TODO confirm.  */
  303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
  304 Lisp_Object Qunix, Qdos;
  305 extern Lisp_Object Qmac;        /* frame.c */
  306 Lisp_Object Qbuffer_file_coding_system;
  307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
  308 Lisp_Object Qdefault_char;
  309 Lisp_Object Qno_conversion, Qundecided;
  310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
  311 Lisp_Object Qbig, Qlittle;
  312 Lisp_Object Qcoding_system_history;
  313 Lisp_Object Qvalid_codes;
      /* NOTE(review): `QCdefalut_char' looks like a misspelling of
         `QCdefault_char'.  Renaming it requires updating every use, which
         may lie outside this part of the file.  */
  314 Lisp_Object QCcategory, QCmnemonic, QCdefalut_char;
  315 Lisp_Object QCdecode_translation_table, QCencode_translation_table;
  316 Lisp_Object QCpost_read_conversion, QCpre_write_conversion;
 317
  318 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
      /* Symbols naming I/O operations.  NOTE(review): presumably used as
         lookup keys when choosing a coding system for an operation --
         confirm against the callers.  */
  319 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
  320 Lisp_Object Qstart_process, Qopen_network_stream;
  321 Lisp_Object Qtarget_idx;
  322
      /* NOTE(review): the names suggest one symbol per CODING_RESULT_XXX
         code reported through record_conversion_result -- confirm.  */
  323 Lisp_Object Qinsufficient_source, Qinconsistent_eol, Qinvalid_source;
  324 Lisp_Object Qinterrupted, Qinsufficient_memory;
  325
  326 int coding_system_require_warning;
  327
  328 Lisp_Object Vselect_safe_coding_system_function;
  329
  330 /* Mnemonic string for each format of end-of-line.  */
  331 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
  332 /* Mnemonic string to indicate format of end-of-line is not yet
  333    decided.  */
  334 Lisp_Object eol_mnemonic_undecided;
 335
  336 #ifdef emacs
      /* Everything down to the matching #endif is compiled only when
         `emacs' is defined.  */
  337
  338 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
  339
  340 Lisp_Object Qcoding_system_p, Qcoding_system_error;
  341
  342 /* Coding system emacs-mule and raw-text are for converting only
  343    end-of-line format.  */
  344 Lisp_Object Qemacs_mule, Qraw_text;
  345 Lisp_Object Qutf_8_emacs;
  346
  347 /* Coding-systems are handed between Emacs Lisp programs and C internal
  348    routines by the following three variables.  */
  349 /* Coding-system for reading files and receiving data from process.  */
  350 Lisp_Object Vcoding_system_for_read;
  351 /* Coding-system for writing files and sending data to process.  */
  352 Lisp_Object Vcoding_system_for_write;
  353 /* Coding-system actually used in the latest I/O.  */
  354 Lisp_Object Vlast_coding_system_used;
  355 /* Set to non-nil when an error is detected while code conversion.  */
  356 Lisp_Object Vlast_code_conversion_error;
  357 /* A vector of length 256 which contains information about special
  358    Latin codes (especially for dealing with Microsoft codes).  */
  359 Lisp_Object Vlatin_extra_code_table;
  360
  361 /* Flag to inhibit code conversion of end-of-line format.  */
  362 int inhibit_eol_conversion;
  363
  364 /* Flag to inhibit ISO2022 escape sequence detection.  */
  365 int inhibit_iso_escape_detection;
  366
  367 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
  368 int inherit_process_coding_system;
  369
  370 /* Coding system to be used to encode text for terminal display.  */
  371 struct coding_system terminal_coding;
  372
  373 /* Coding system to be used to encode text for terminal display when
  374    terminal coding system is nil.  */
  375 struct coding_system safe_terminal_coding;
  376
  377 /* Coding system of what is sent from terminal keyboard.  */
  378 struct coding_system keyboard_coding;
  379
      /* NOTE(review): judging by the names, alists consulted to pick a
         coding system for file, process and network I/O respectively --
         confirm against the code that reads them.  */
  380 Lisp_Object Vfile_coding_system_alist;
  381 Lisp_Object Vprocess_coding_system_alist;
  382 Lisp_Object Vnetwork_coding_system_alist;
  383
  384 Lisp_Object Vlocale_coding_system;
  385
  386 #endif /* emacs */
 387
  388 /* Flag to tell if we look up translation table on character code
  389    conversion.  */
  390 Lisp_Object Venable_character_translation;
  391 /* Standard translation table to look up on decoding (reading).  */
  392 Lisp_Object Vstandard_translation_table_for_decode;
  393 /* Standard translation table to look up on encoding (writing).  */
  394 Lisp_Object Vstandard_translation_table_for_encode;
  395
      /* Symbols related to translation tables.  */
  396 Lisp_Object Qtranslation_table;
  397 Lisp_Object Qtranslation_table_id;
  398 Lisp_Object Qtranslation_table_for_decode;
  399 Lisp_Object Qtranslation_table_for_encode;
  400
  401 /* Alist of charsets vs revision number.  */
      /* Declared static, so it is visible only within this file.  */
  402 static Lisp_Object Vcharset_revision_table;
  403
  404 /* Default coding systems used for process I/O.  */
  405 Lisp_Object Vdefault_process_coding_system;
  406
  407 /* Char table for translating Quail and self-inserting input.  */
  408 Lisp_Object Vtranslation_table_for_input;
  409
  410 /* Two special coding systems.  */
      /* NOTE(review): presumably the coding systems behind the SJIS and
         BIG5 handlers of section 8 -- confirm.  */
  411 Lisp_Object Vsjis_coding_system;
  412 Lisp_Object Vbig5_coding_system;
 413
  414 /* ISO2022 section */
  415
      /* Return, as a C integer, element REG of the vector stored in the
         `coding_attr_iso_initial' slot of the attribute vector of CODING
         (the attribute vector is fetched from CODING->id by
         CODING_ID_ATTRS).  */
  416 #define CODING_ISO_INITIAL(coding, reg)                 \
  417   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
  418                      coding_attr_iso_initial),          \
  419                reg)))
 420
 421
 422 #define CODING_ISO_REQUEST(coding, charset_id)  \
 423   ((charset_id <= (coding)->max_charset_id      \
 424     ? (coding)->safe_charsets[charset_id]       \
 425     : -1))
 426
  427
      /* Accessors for the members of CODING->spec.iso_2022.  Each expands
         to the member itself, so it can be read or assigned.  */
  428 #define CODING_ISO_FLAGS(coding)        \
  429   ((coding)->spec.iso_2022.flags)
  430 #define CODING_ISO_DESIGNATION(coding, reg)     \
  431   ((coding)->spec.iso_2022.current_designation[reg])
  432 #define CODING_ISO_INVOCATION(coding, plane)    \
  433   ((coding)->spec.iso_2022.current_invocation[plane])
  434 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
  435   ((coding)->spec.iso_2022.single_shifting)
  436 #define CODING_ISO_BOL(coding)  \
  437   ((coding)->spec.iso_2022.bol)
      /* The designation of the register that is currently invoked to
         PLANE, i.e. current_designation[current_invocation[PLANE]].  */
  438 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
  439   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 440
  441 /* Control characters of ISO2022.  */
  442                         /* code */      /* function */
  443 #define ISO_CODE_LF     0x0A            /* line-feed */
  444 #define ISO_CODE_CR     0x0D            /* carriage-return */
  445 #define ISO_CODE_SO     0x0E            /* shift-out */
  446 #define ISO_CODE_SI     0x0F            /* shift-in */
  447 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
  448 #define ISO_CODE_ESC    0x1B            /* escape */
  449 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
  450 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
  451 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
  452
  453 /* All code (1-byte) of ISO2022 is classified into one of the
  454    followings.  */
      /* NOTE(review): 0x7F is mentioned both under ISO_control_0 and under
         ISO_0x20_or_0x7F; which class it really gets depends on the table
         that assigns classes, which is not in this part of the file.  */
  455 enum iso_code_class_type
  456   {
  457     ISO_control_0,              /* Control codes in the range
  458                                    0x00..0x1F and 0x7F, except for the
  459                                    following 4 codes.  */
  460     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
  461     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
  462     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
  463     ISO_escape,                 /* ISO_CODE_ESC (0x1B) */
  464     ISO_control_1,              /* Control codes in the range
  465                                    0x80..0x9F, except for the
  466                                    following 3 codes.  */
  467     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
  468     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
  469     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
  470     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
  471     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
  472     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
  473     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
  474   };
 475
  476 /** Each macro CODING_ISO_FLAG_XXX defines a flag bit of the
  477     `iso-flags' attribute of an iso2022 coding system.  */
  478
  479 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
  480    instead of the correct short-form sequence (e.g. ESC $ A).  */
  481 #define CODING_ISO_FLAG_LONG_FORM       0x0001
  482
  483 /* If set, reset graphic planes and registers at end-of-line to the
  484    initial state.  */
  485 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
  486
  487 /* If set, reset graphic planes and registers before any control
  488    characters to the initial state.  */
  489 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
  490
  491 /* If set, encode by 7-bit environment.  */
  492 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
  493
  494 /* If set, use locking-shift function.  */
  495 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
  496
  497 /* If set, use single-shift function.  Overwrite
  498    CODING_ISO_FLAG_LOCKING_SHIFT.  */
  499 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
  500
  501 /* If set, use designation escape sequence.  */
  502 #define CODING_ISO_FLAG_DESIGNATION     0x0040
  503
  504 /* If set, produce revision number sequence.  */
  505 #define CODING_ISO_FLAG_REVISION        0x0080
  506
  507 /* If set, produce ISO6429's direction specifying sequence.  */
  508 #define CODING_ISO_FLAG_DIRECTION       0x0100
  509
  510 /* If set, assume designation states are reset at beginning of line on
  511    output.  */
  512 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
  513
  514 /* If set, designation sequence should be placed at beginning of line
  515    on output.  */
  516 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
  517
  518 /* If set, do not encode unsafe characters on output.  */
  519 #define CODING_ISO_FLAG_SAFE            0x0800
  520
  521 /* If set, extra latin codes (128..159) are accepted as a valid code
  522    on input.  */
  523 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
  524
      /* NOTE(review): the following five flags carry no description here;
         take their meaning from the code that tests them.  The bits
         0x20000, 0x40000 and 0x80000 are not assigned in this list.  */
  525 #define CODING_ISO_FLAG_COMPOSITION     0x2000
  526
  527 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
  528
  529 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
  530
  531 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
  532
  533 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
  534
  535 /* A character to be produced on output if encoding of the original
  536    character is prohibited by CODING_ISO_FLAG_SAFE.  */
  537 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 538
 539
  540 /* UTF-16 section */
      /* Accessors for the members of CODING->spec.utf_16.  Each expands to
         the member itself, so it can be read or assigned.  */
  541 #define CODING_UTF_16_BOM(coding)       \
  542   ((coding)->spec.utf_16.bom)
  543
  544 #define CODING_UTF_16_ENDIAN(coding)    \
  545   ((coding)->spec.utf_16.endian)
  546
  547 #define CODING_UTF_16_SURROGATE(coding) \
  548   ((coding)->spec.utf_16.surrogate)
 549
 550
  551 /* CCL section */
      /* Each macro fetches one slot (coding_attr_ccl_decoder,
         coding_attr_ccl_encoder, coding_attr_ccl_valids) of the attribute
         vector of CODING.  CODING_CCL_VALIDS additionally applies SDATA to
         the slot value.  */
  552 #define CODING_CCL_DECODER(coding)      \
  553   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
  554 #define CODING_CCL_ENCODER(coding)      \
  555   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
  556 #define CODING_CCL_VALIDS(coding)                                          \
  557   (SDATA (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)))
 558
  559 /* Index for each coding category in `coding_categories' */
      /* The numeric value of each enumerator is also the bit position of
         the corresponding CATEGORY_MASK_XXX, and coding_category_max is
         used as the length of the per-category arrays, so inserting or
         reordering enumerators affects both.  */
  560
  561 enum coding_category
  562   {
  563     coding_category_iso_7,
  564     coding_category_iso_7_tight,
  565     coding_category_iso_8_1,
  566     coding_category_iso_8_2,
  567     coding_category_iso_7_else,
  568     coding_category_iso_8_else,
  569     coding_category_utf_8,
  570     coding_category_utf_16_auto,
  571     coding_category_utf_16_be,
  572     coding_category_utf_16_le,
  573     coding_category_utf_16_be_nosig,
  574     coding_category_utf_16_le_nosig,
  575     coding_category_charset,
  576     coding_category_sjis,
  577     coding_category_big5,
  578     coding_category_ccl,
  579     coding_category_emacs_mule,
  580     /* All above are targets of code detection.  */
  581     coding_category_raw_text,
  582     coding_category_undecided,
  583     coding_category_max
  584   };
 585
  586 /* Definitions of flag bits used in detect_coding_XXXX.  */
      /* Each mask is the single bit whose position is the value of the
         matching `enum coding_category' member.  No mask is defined for
         coding_category_undecided.  */
  587 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
  588 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
  589 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
  590 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
  591 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
  592 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
  593 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
  594 #define CATEGORY_MASK_UTF_16_AUTO       (1 << coding_category_utf_16_auto)
  595 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
  596 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
  597 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
  598 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
  599 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
  600 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
  601 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
  602 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
  603 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
  604 #define CATEGORY_MASK_RAW_TEXT          (1 << coding_category_raw_text)
  605
  606 /* This value is returned if detect_coding_mask () find nothing other
  607    than ASCII characters.  */
      /* NOTE(review): CATEGORY_MASK_UTF_16_AUTO and CATEGORY_MASK_RAW_TEXT
         are not part of this union, although utf_16_auto is listed among
         the targets of code detection in `enum coding_category' -- confirm
         whether the omission of UTF_16_AUTO is intended.  */
  608 #define CATEGORY_MASK_ANY               \
  609   (CATEGORY_MASK_ISO_7                  \
  610    | CATEGORY_MASK_ISO_7_TIGHT          \
  611    | CATEGORY_MASK_ISO_8_1              \
  612    | CATEGORY_MASK_ISO_8_2              \
  613    | CATEGORY_MASK_ISO_7_ELSE           \
  614    | CATEGORY_MASK_ISO_8_ELSE           \
  615    | CATEGORY_MASK_UTF_8                \
  616    | CATEGORY_MASK_UTF_16_BE            \
  617    | CATEGORY_MASK_UTF_16_LE            \
  618    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  619    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
  620    | CATEGORY_MASK_CHARSET              \
  621    | CATEGORY_MASK_SJIS                 \
  622    | CATEGORY_MASK_BIG5                 \
  623    | CATEGORY_MASK_CCL                  \
  624    | CATEGORY_MASK_EMACS_MULE)
  625
  626
      /* Unions of the ISO2022 category masks.  */
  627 #define CATEGORY_MASK_ISO_7BIT \
  628   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
  629
  630 #define CATEGORY_MASK_ISO_8BIT \
  631   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
  632
  633 #define CATEGORY_MASK_ISO_ELSE \
  634   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
  635
  636 #define CATEGORY_MASK_ISO_ESCAPE        \
  637   (CATEGORY_MASK_ISO_7                  \
  638    | CATEGORY_MASK_ISO_7_TIGHT          \
  639    | CATEGORY_MASK_ISO_7_ELSE           \
  640    | CATEGORY_MASK_ISO_8_ELSE)
  641
  642 #define CATEGORY_MASK_ISO       \
  643   (  CATEGORY_MASK_ISO_7BIT     \
  644      | CATEGORY_MASK_ISO_8BIT   \
  645      | CATEGORY_MASK_ISO_ELSE)
  646
      /* Union of the four UTF-16 masks with a fixed byte order;
         CATEGORY_MASK_UTF_16_AUTO is not included.  */
  647 #define CATEGORY_MASK_UTF_16            \
  648   (CATEGORY_MASK_UTF_16_BE              \
  649    | CATEGORY_MASK_UTF_16_LE            \
  650    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
  651    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 652
 653
  654 /* List of symbols `coding-category-xxx' ordered by priority.  This
  655    variable is exposed to Emacs Lisp.  */
  656 static Lisp_Object Vcoding_category_list;
  657
  658 /* Table of coding categories (Lisp symbols).  This variable is for
  659    internal use only.  */
  660 static Lisp_Object Vcoding_category_table;
  661
  662 /* Table of coding-categories ordered by priority.  */
      /* It has one slot per `enum coding_category' member below
         coding_category_max.  */
  663 static enum coding_category coding_priorities[coding_category_max];
  664
  665 /* Nth element is a coding context for the coding system bound to the
  666    Nth coding category.  */
  667 static struct coding_system coding_categories[coding_category_max];
 668
  669 /*** Commonly used macros and functions ***/
  670
      /* Fallback definitions of min and max, used only when none is already
         defined.  Each argument may be evaluated twice, so arguments must
         not have side effects.  */
  671 #ifndef min
  672 #define min(a, b) ((a) < (b) ? (a) : (b))
  673 #endif
  674 #ifndef max
  675 #define max(a, b) ((a) > (b) ? (a) : (b))
  676 #endif
  677
      /* Store the attribute vector of CODING in ATTRS and the charset list
         taken from that vector in CHARSET_LIST.  ATTRS and CHARSET_LIST
         must be assignable; ATTRS is assigned first because the second
         assignment reads it.  */
  678 #define CODING_GET_INFO(coding, attrs, charset_list)    \
  679   do {                                                  \
  680     (attrs) = CODING_ID_ATTRS ((coding)->id);           \
  681     (charset_list) = CODING_ATTR_CHARSET_LIST (attrs);  \
  682   } while (0)
 683
 684
  685 /* Safely get one byte from the source text pointed by SRC which ends
  686    at SRC_END, and set C to that byte.  If there are not enough bytes
  687    in the source, it jumps to `no_more_source'; if SRC has by then
         advanced past SRC_BASE, CODING_RESULT_INSUFFICIENT_SRC is recorded
         in CODING first.  If multibytep is
  688    nonzero, and a multibyte character is found at SRC, set C to the
  689    negative value of the character code and record
         CODING_RESULT_INVALID_SRC in CODING; the exception is a two-byte
         sequence whose first byte is 0xC0 or 0xC1, which is folded into a
         single byte value.  CONSUMED_CHARS is incremented once per
         invocation.  The caller should declare
  690    and set these variables appropriately in advance:
  691         src, src_base, src_end, multibytep, coding, consumed_chars

         NOTE(review): the second byte of the 0xC0/0xC1 form is fetched
         without comparing SRC with SRC_END; presumably multibyte sources
         are guaranteed to hold complete sequences -- confirm.  */
  692
  693 #define ONE_MORE_BYTE(c)                                \
  694   do {                                                  \
  695     if (src == src_end)                                 \
  696       {                                                 \
  697         if (src_base < src)                             \
  698           record_conversion_result                      \
  699             (coding, CODING_RESULT_INSUFFICIENT_SRC);   \
  700         goto no_more_source;                            \
  701       }                                                 \
  702     c = *src++;                                         \
  703     if (multibytep && (c & 0x80))                       \
  704       {                                                 \
  705         if ((c & 0xFE) == 0xC0)                         \
  706           c = ((c & 1) << 6) | *src++;                  \
  707         else                                            \
  708           {                                             \
  709             c = - string_char (--src, &src, NULL);      \
  710             record_conversion_result                    \
  711               (coding, CODING_RESULT_INVALID_SRC);      \
  712           }                                             \
  713       }                                                 \
  714     consumed_chars++;                                   \
  715   } while (0)
 716
  717
      /* Like ONE_MORE_BYTE, but without the test for SRC having reached
         SRC_END: it never jumps to `no_more_source' and never records
         CODING_RESULT_INSUFFICIENT_SRC.  The caller must therefore already
         know that the needed bytes are available.  The variables src,
         multibytep, coding and consumed_chars must be in scope.  */
  718 #define ONE_MORE_BYTE_NO_CHECK(c)                       \
  719   do {                                                  \
  720     c = *src++;                                         \
  721     if (multibytep && (c & 0x80))                       \
  722       {                                                 \
  723         if ((c & 0xFE) == 0xC0)                         \
  724           c = ((c & 1) << 6) | *src++;                  \
  725         else                                            \
  726           {                                             \
  727             c = - string_char (--src, &src, NULL);      \
  728             record_conversion_result                    \
  729               (coding, CODING_RESULT_INVALID_SRC);      \
  730           }                                             \
  731       }                                                 \
  732     consumed_chars++;                                   \
  733   } while (0)
 734
 735
  736 /* Store a byte C in the place pointed by DST and increment DST to the
  737    next free point, and increment PRODUCED_CHARS.  The caller should
  738    assure that C is 0..127, and declare and set the variables `dst'
  739    and `produced_chars' appropriately in advance.  No check is made
         that DST stays within the destination.
  740 */
  741
  742
  743 #define EMIT_ONE_ASCII_BYTE(c)  \
  744   do {                          \
  745     produced_chars++;           \
  746     *dst++ = (c);               \
  747   } while (0)
  748
  749
  750 /* Like EMIT_ONE_ASCII_BYTE but store two bytes; C1 and C2.  */
  751
  752 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
  753   do {                                  \
  754     produced_chars += 2;                \
  755     *dst++ = (c1), *dst++ = (c2);       \
  756   } while (0)
 757
 758
  759 /* Store a byte C in the place pointed by DST and increment DST to the
  760    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
  761    nonzero, store in an appropriate multibyte form: a value of 0x80 or
         above is first converted with BYTE8_TO_CHAR, then the character
         is written with CHAR_STRING_ADVANCE.  The caller should
  762    declare and set the variables `dst', `multibytep' and
         `produced_chars' appropriately
  763    in advance.  */
  764
  765 #define EMIT_ONE_BYTE(c)                \
  766   do {                                  \
  767     produced_chars++;                   \
  768     if (multibytep)                     \
  769       {                                 \
  770         int ch = (c);                   \
  771         if (ch >= 0x80)                 \
  772           ch = BYTE8_TO_CHAR (ch);      \
  773         CHAR_STRING_ADVANCE (ch, dst);  \
  774       }                                 \
  775     else                                \
  776       *dst++ = (c);                     \
  777   } while (0)
 778
 779
 780 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 781
 782 #define EMIT_TWO_BYTES(c1, c2)          \
 783   do {                                  \
 784     produced_chars += 2;                \
 785     if (multibytep)                     \
 786       {                                 \
 787         int ch;                         \
 788                                         \
 789         ch = (c1);                      \
 790         if (ch >= 0x80)                 \
 791           ch = BYTE8_TO_CHAR (ch);      \
 792         CHAR_STRING_ADVANCE (ch, dst);  \
 793         ch = (c2);                      \
 794         if (ch >= 0x80)                 \
 795           ch = BYTE8_TO_CHAR (ch);      \
 796         CHAR_STRING_ADVANCE (ch, dst);  \
 797       }                                 \
 798     else                                \
 799       {                                 \
 800         *dst++ = (c1);                  \
 801         *dst++ = (c2);                  \
 802       }                                 \
 803   } while (0)
 804
 805
 806 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 807   do {                                  \
 808     EMIT_ONE_BYTE (c1);                 \
 809     EMIT_TWO_BYTES (c2, c3);            \
 810   } while (0)
 811
 812
 813 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 814   do {                                          \
 815     EMIT_TWO_BYTES (c1, c2);                    \
 816     EMIT_TWO_BYTES (c3, c4);                    \
 817   } while (0)
 818
 819
 820 /* Prototypes for static functions.  */
 821 static void record_conversion_result P_ ((struct coding_system *coding,
 822                                           enum coding_result_code result));
 823 static int detect_coding_utf_8 P_ ((struct coding_system *,
 824                                     struct coding_detection_info *info));
 825 static void decode_coding_utf_8 P_ ((struct coding_system *));
 826 static int encode_coding_utf_8 P_ ((struct coding_system *));
 827
 828 static int detect_coding_utf_16 P_ ((struct coding_system *,
 829                                      struct coding_detection_info *info));
 830 static void decode_coding_utf_16 P_ ((struct coding_system *));
 831 static int encode_coding_utf_16 P_ ((struct coding_system *));
 832
 833 static int detect_coding_iso_2022 P_ ((struct coding_system *,
 834                                        struct coding_detection_info *info));
 835 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 836 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 837
 838 static int detect_coding_emacs_mule P_ ((struct coding_system *,
 839                                          struct coding_detection_info *info));
 840 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 841 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 842
 843 static int detect_coding_sjis P_ ((struct coding_system *,
 844                                    struct coding_detection_info *info));
 845 static void decode_coding_sjis P_ ((struct coding_system *));
 846 static int encode_coding_sjis P_ ((struct coding_system *));
 847
 848 static int detect_coding_big5 P_ ((struct coding_system *,
 849                                    struct coding_detection_info *info));
 850 static void decode_coding_big5 P_ ((struct coding_system *));
 851 static int encode_coding_big5 P_ ((struct coding_system *));
 852
 853 static int detect_coding_ccl P_ ((struct coding_system *,
 854                                   struct coding_detection_info *info));
 855 static void decode_coding_ccl P_ ((struct coding_system *));
 856 static int encode_coding_ccl P_ ((struct coding_system *));
 857
 858 static void decode_coding_raw_text P_ ((struct coding_system *));
 859 static int encode_coding_raw_text P_ ((struct coding_system *));
 860
 861 static void coding_set_source P_ ((struct coding_system *));
 862 static void coding_set_destination P_ ((struct coding_system *));
 863 static void coding_alloc_by_realloc P_ ((struct coding_system *, EMACS_INT));
 864 static void coding_alloc_by_making_gap P_ ((struct coding_system *,
 865                                             EMACS_INT));
 866 static unsigned char *alloc_destination P_ ((struct coding_system *,
 867                                              EMACS_INT, unsigned char *));
 868 static void setup_iso_safe_charsets P_ ((Lisp_Object));
 869 static unsigned char *encode_designation_at_bol P_ ((struct coding_system *,
 870                                                      int *, int *,
 871                                                      unsigned char *));
 872 static int detect_eol P_ ((const unsigned char *,
 873                            EMACS_INT, enum coding_category));
 874 static Lisp_Object adjust_coding_eol_type P_ ((struct coding_system *, int));
 875 static void decode_eol P_ ((struct coding_system *));
 876 static Lisp_Object get_translation_table P_ ((Lisp_Object, int, int *));
 877 static Lisp_Object get_translation P_ ((Lisp_Object, int *, int *,
 878                                         int, int *, int *));
 879 static int produce_chars P_ ((struct coding_system *, Lisp_Object, int));
 880 static INLINE void produce_composition P_ ((struct coding_system *, int *,
 881                                             EMACS_INT));
 882 static INLINE void produce_charset P_ ((struct coding_system *, int *,
 883                                         EMACS_INT));
 884 static void produce_annotation P_ ((struct coding_system *, EMACS_INT));
 885 static int decode_coding P_ ((struct coding_system *));
 886 static INLINE int *handle_composition_annotation P_ ((EMACS_INT, EMACS_INT,
 887                                                       struct coding_system *,
 888                                                       int *, EMACS_INT *));
 889 static INLINE int *handle_charset_annotation P_ ((EMACS_INT, EMACS_INT,
 890                                                   struct coding_system *,
 891                                                   int *, EMACS_INT *));
 892 static void consume_chars P_ ((struct coding_system *, Lisp_Object, int));
 893 static int encode_coding P_ ((struct coding_system *));
 894 static Lisp_Object make_conversion_work_buffer P_ ((int));
 895 static Lisp_Object code_conversion_restore P_ ((Lisp_Object));
 896 static INLINE int char_encodable_p P_ ((int, Lisp_Object));
 897 static Lisp_Object make_subsidiaries P_ ((Lisp_Object));
 898
 899 static void
 900 record_conversion_result (struct coding_system *coding,
 901                           enum coding_result_code result)
 902 {
 903   coding->result = result;
 904   switch (result)
 905     {
 906     case CODING_RESULT_INSUFFICIENT_SRC:
 907       Vlast_code_conversion_error = Qinsufficient_source;
 908       break;
 909     case CODING_RESULT_INCONSISTENT_EOL:
 910       Vlast_code_conversion_error = Qinconsistent_eol;
 911       break;
 912     case CODING_RESULT_INVALID_SRC:
 913       Vlast_code_conversion_error = Qinvalid_source;
 914       break;
 915     case CODING_RESULT_INTERRUPT:
 916       Vlast_code_conversion_error = Qinterrupted;
 917       break;
 918     case CODING_RESULT_INSUFFICIENT_MEM:
 919       Vlast_code_conversion_error = Qinsufficient_memory;
 920       break;
 921     }
 922 }
 923
 924 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 925   do {                                                                       \
 926     charset_map_loaded = 0;                                                  \
 927     c = DECODE_CHAR (charset, code);                                         \
 928     if (charset_map_loaded)                                                  \
 929       {                                                                      \
 930         const unsigned char *orig = coding->source;                          \
 931         EMACS_INT offset;                                                    \
 932                                                                              \
 933         coding_set_source (coding);                                          \
 934         offset = coding->source - orig;                                      \
 935         src += offset;                                                       \
 936         src_base += offset;                                                  \
 937         src_end += offset;                                                   \
 938       }                                                                      \
 939   } while (0)
 940
 941
 942 #define ASSURE_DESTINATION(bytes)                               \
 943   do {                                                          \
 944     if (dst + (bytes) >= dst_end)                               \
 945       {                                                         \
 946         int more_bytes = charbuf_end - charbuf + (bytes);       \
 947                                                                 \
 948         dst = alloc_destination (coding, more_bytes, dst);      \
 949         dst_end = coding->destination + coding->dst_bytes;      \
 950       }                                                         \
 951   } while (0)
 952
 953
 954
 955 static void
 956 coding_set_source (coding)
 957      struct coding_system *coding;
 958 {
 959   if (BUFFERP (coding->src_object))
 960     {
 961       struct buffer *buf = XBUFFER (coding->src_object);
 962
 963       if (coding->src_pos < 0)
 964         coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
 965       else
 966         coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
 967     }
 968   else if (STRINGP (coding->src_object))
 969     {
 970       coding->source = SDATA (coding->src_object) + coding->src_pos_byte;
 971     }
 972   else
 973     /* Otherwise, the source is C string and is never relocated
 974        automatically.  Thus we don't have to update anything.  */
 975     ;
 976 }
 977
 978 static void
 979 coding_set_destination (coding)
 980      struct coding_system *coding;
 981 {
 982   if (BUFFERP (coding->dst_object))
 983     {
 984       if (coding->src_pos < 0)
 985         {
 986           coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
 987           coding->dst_bytes = (GAP_END_ADDR
 988                                - (coding->src_bytes - coding->consumed)
 989                                - coding->destination);
 990         }
 991       else
 992         {
 993           /* We are sure that coding->dst_pos_byte is before the gap
 994              of the buffer. */
 995           coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 996                                  + coding->dst_pos_byte - 1);
 997           coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 998                                - coding->destination);
 999         }
1000     }
1001   else
1002     /* Otherwise, the destination is C string and is never relocated
1003        automatically.  Thus we don't have to update anything.  */
1004     ;
1005 }
1006
1007
1008 static void
1009 coding_alloc_by_realloc (coding, bytes)
1010      struct coding_system *coding;
1011      EMACS_INT bytes;
1012 {
1013   coding->destination = (unsigned char *) xrealloc (coding->destination,
1014                                                     coding->dst_bytes + bytes);
1015   coding->dst_bytes += bytes;
1016 }
1017
1018 static void
1019 coding_alloc_by_making_gap (coding, bytes)
1020      struct coding_system *coding;
1021      EMACS_INT bytes;
1022 {
1023   if (BUFFERP (coding->dst_object)
1024       && EQ (coding->src_object, coding->dst_object))
1025     {
1026       EMACS_INT add = coding->src_bytes - coding->consumed;
1027
1028       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
1029       make_gap (bytes);
1030       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
1031     }
1032   else
1033     {
1034       Lisp_Object this_buffer;
1035
1036       this_buffer = Fcurrent_buffer ();
1037       set_buffer_internal (XBUFFER (coding->dst_object));
1038       make_gap (bytes);
1039       set_buffer_internal (XBUFFER (this_buffer));
1040     }
1041 }
1042
1043
1044 static unsigned char *
1045 alloc_destination (coding, nbytes, dst)
1046      struct coding_system *coding;
1047      EMACS_INT nbytes;
1048      unsigned char *dst;
1049 {
1050   EMACS_INT offset = dst - coding->destination;
1051
1052   if (BUFFERP (coding->dst_object))
1053     coding_alloc_by_making_gap (coding, nbytes);
1054   else
1055     coding_alloc_by_realloc (coding, nbytes);
1056   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1057   coding_set_destination (coding);
1058   dst = coding->destination + offset;
1059   return dst;
1060 }
1061
1062 /** Macros for annotations.  */
1063
/* Maximum length of annotation data (sum of annotations for
   composition and charset).  The leading 4 matches the four elements
   ADD_COMPOSITION_DATA stores and the trailing 4 the four elements
   ADD_CHARSET_DATA stores.  The middle term leaves room for the
   composition components.
   NOTE(review): presumably up to MAX_COMPOSITION_COMPONENTS
   characters with one rule between each pair -- confirm.  */
#define MAX_ANNOTATION_LENGTH (4 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 4)
1067
1068 /* An annotation data is stored in the array coding->charbuf in this
1069    format:
1070      [ -LENGTH ANNOTATION_MASK NCHARS ... ]
1071    LENGTH is the number of elements in the annotation.
1072    ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
1073    NCHARS is the number of characters in the text annotated.
1074
   The format of the following elements depends on ANNOTATION_MASK.

   In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
   follow:
     ... METHOD [ COMPOSITION-COMPONENTS ... ]
   METHOD is one of enum composition_method.
   Optional COMPOSITION-COMPONENTS are characters and composition
   rules.
1083
1084    In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1085    follows.  */
1086
/* Store the three leading elements of an annotation at BUF, namely
   -LEN, MASK and NCHARS, advancing BUF past them, and set
   coding->annotated.  The caller must have `coding' in scope.

   The expansion deliberately does not end in a semicolon: the caller
   writes it, so the macro behaves as one statement, also in front of
   `else'.  */

#define ADD_ANNOTATION_DATA(buf, len, mask, nchars)     \
  do {                                                  \
    *(buf)++ = -(len);                                  \
    *(buf)++ = (mask);                                  \
    *(buf)++ = (nchars);                                \
    coding->annotated = 1;                              \
  } while (0)
1094
/* Store a composition annotation header at BUF: the three common
   elements written by ADD_ANNOTATION_DATA (with length 4 and
   CODING_ANNOTATE_COMPOSITION_MASK), followed by METHOD.  BUF is
   advanced past all four elements.  The arguments are parenthesized
   so that BUF may be any lvalue expression, e.g. `*ptr'.  */

#define ADD_COMPOSITION_DATA(buf, nchars, method)                           \
  do {                                                                      \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_COMPOSITION_MASK, nchars); \
    *(buf)++ = (method);                                                    \
  } while (0)
1100
1101
/* Store a charset annotation at BUF: the three common elements
   written by ADD_ANNOTATION_DATA (with length 4 and
   CODING_ANNOTATE_CHARSET_MASK), followed by the charset ID.  BUF is
   advanced past all four elements.  The arguments are parenthesized
   so that BUF may be any lvalue expression, e.g. `*ptr'.  */

#define ADD_CHARSET_DATA(buf, nchars, id)                               \
  do {                                                                  \
    ADD_ANNOTATION_DATA (buf, 4, CODING_ANNOTATE_CHARSET_MASK, nchars); \
    *(buf)++ = (id);                                                    \
  } while (0)
1107
1108 \f
1109 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1110
1111
1112
1113 \f
1114 /*** 3. UTF-8 ***/
1115
1116 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1117    Check if a text is encoded in UTF-8.  If it is, return 1, else
1118    return 0.  */
1119
/* Predicates classifying one octet C of a UTF-8 sequence by its high
   bits.  The bit pattern each one accepts is shown beside it.
   UTF_8_1_OCTET_P is a plain comparison, so it also holds for a
   negative C.  */
#define UTF_8_1_OCTET_P(c)         ((c) < 0x80)             /* 0xxxxxxx */
#define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)   /* 10xxxxxx */
#define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)   /* 110xxxxx */
#define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)   /* 1110xxxx */
#define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)   /* 11110xxx */
#define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)   /* 111110xx */
1126
1127 static int
1128 detect_coding_utf_8 (coding, detect_info)
1129      struct coding_system *coding;
1130      struct coding_detection_info *detect_info;
1131 {
1132   const unsigned char *src = coding->source, *src_base;
1133   const unsigned char *src_end = coding->source + coding->src_bytes;
1134   int multibytep = coding->src_multibyte;
1135   int consumed_chars = 0;
1136   int found = 0;
1137
1138   detect_info->checked |= CATEGORY_MASK_UTF_8;
1139   /* A coding system of this category is always ASCII compatible.  */
1140   src += coding->head_ascii;
1141
1142   while (1)
1143     {
1144       int c, c1, c2, c3, c4;
1145
1146       src_base = src;
1147       ONE_MORE_BYTE (c);
1148       if (c < 0 || UTF_8_1_OCTET_P (c))
1149         continue;
1150       ONE_MORE_BYTE (c1);
1151       if (c1 < 0 || ! UTF_8_EXTRA_OCTET_P (c1))
1152         break;
1153       if (UTF_8_2_OCTET_LEADING_P (c))
1154         {
1155           found = CATEGORY_MASK_UTF_8;
1156           continue;
1157         }
1158       ONE_MORE_BYTE (c2);
1159       if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1160         break;
1161       if (UTF_8_3_OCTET_LEADING_P (c))
1162         {
1163           found = CATEGORY_MASK_UTF_8;
1164           continue;
1165         }
1166       ONE_MORE_BYTE (c3);
1167       if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1168         break;
1169       if (UTF_8_4_OCTET_LEADING_P (c))
1170         {
1171           found = CATEGORY_MASK_UTF_8;
1172           continue;
1173         }
1174       ONE_MORE_BYTE (c4);
1175       if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1176         break;
1177       if (UTF_8_5_OCTET_LEADING_P (c))
1178         {
1179           found = CATEGORY_MASK_UTF_8;
1180           continue;
1181         }
1182       break;
1183     }
1184   detect_info->rejected |= CATEGORY_MASK_UTF_8;
1185   return 0;
1186
1187  no_more_source:
1188   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1189     {
1190       detect_info->rejected |= CATEGORY_MASK_UTF_8;
1191       return 0;
1192     }
1193   detect_info->found |= found;
1194   return 1;
1195 }
1196
1197
1198 static void
1199 decode_coding_utf_8 (coding)
1200      struct coding_system *coding;
1201 {
1202   const unsigned char *src = coding->source + coding->consumed;
1203   const unsigned char *src_end = coding->source + coding->src_bytes;
1204   const unsigned char *src_base;
1205   int *charbuf = coding->charbuf + coding->charbuf_used;
1206   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1207   int consumed_chars = 0, consumed_chars_base;
1208   int multibytep = coding->src_multibyte;
1209   Lisp_Object attr, charset_list;
1210
1211   CODING_GET_INFO (coding, attr, charset_list);
1212
1213   while (1)
1214     {
1215       int c, c1, c2, c3, c4, c5;
1216
1217       src_base = src;
1218       consumed_chars_base = consumed_chars;
1219
1220       if (charbuf >= charbuf_end)
1221         break;
1222
1223       ONE_MORE_BYTE (c1);
1224       if (c1 < 0)
1225         {
1226           c = - c1;
1227         }
1228       else if (UTF_8_1_OCTET_P(c1))
1229         {
1230           c = c1;
1231         }
1232       else
1233         {
1234           ONE_MORE_BYTE (c2);
1235           if (c2 < 0 || ! UTF_8_EXTRA_OCTET_P (c2))
1236             goto invalid_code;
1237           if (UTF_8_2_OCTET_LEADING_P (c1))
1238             {
1239               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1240               /* Reject overlong sequences here and below.  Encoders
1241                  producing them are incorrect, they can be misleading,
1242                  and they mess up read/write invariance.  */
1243               if (c < 128)
1244                 goto invalid_code;
1245             }
1246           else
1247             {
1248               ONE_MORE_BYTE (c3);
1249               if (c3 < 0 || ! UTF_8_EXTRA_OCTET_P (c3))
1250                 goto invalid_code;
1251               if (UTF_8_3_OCTET_LEADING_P (c1))
1252                 {
1253                   c = (((c1 & 0xF) << 12)
1254                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1255                   if (c < 0x800
1256                       || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1257                     goto invalid_code;
1258                 }
1259               else
1260                 {
1261                   ONE_MORE_BYTE (c4);
1262                   if (c4 < 0 || ! UTF_8_EXTRA_OCTET_P (c4))
1263                     goto invalid_code;
1264                   if (UTF_8_4_OCTET_LEADING_P (c1))
1265                     {
1266                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1267                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1268                     if (c < 0x10000)
1269                       goto invalid_code;
1270                     }
1271                   else
1272                     {
1273                       ONE_MORE_BYTE (c5);
1274                       if (c5 < 0 || ! UTF_8_EXTRA_OCTET_P (c5))
1275                         goto invalid_code;
1276                       if (UTF_8_5_OCTET_LEADING_P (c1))
1277                         {
1278                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1279                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1280                                | (c5 & 0x3F));
1281                           if ((c > MAX_CHAR) || (c < 0x200000))
1282                             goto invalid_code;
1283                         }
1284                       else
1285                         goto invalid_code;
1286                     }
1287                 }
1288             }
1289         }
1290
1291       *charbuf++ = c;
1292       continue;
1293
1294     invalid_code:
1295       src = src_base;
1296       consumed_chars = consumed_chars_base;
1297       ONE_MORE_BYTE (c);
1298       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1299       coding->errors++;
1300     }
1301
1302  no_more_source:
1303   coding->consumed_char += consumed_chars_base;
1304   coding->consumed = src_base - coding->source;
1305   coding->charbuf_used = charbuf - coding->charbuf;
1306 }
1307
1308
1309 static int
1310 encode_coding_utf_8 (coding)
1311      struct coding_system *coding;
1312 {
1313   int multibytep = coding->dst_multibyte;
1314   int *charbuf = coding->charbuf;
1315   int *charbuf_end = charbuf + coding->charbuf_used;
1316   unsigned char *dst = coding->destination + coding->produced;
1317   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1318   int produced_chars = 0;
1319   int c;
1320
1321   if (multibytep)
1322     {
1323       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1324
1325       while (charbuf < charbuf_end)
1326         {
1327           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1328
1329           ASSURE_DESTINATION (safe_room);
1330           c = *charbuf++;
1331           if (CHAR_BYTE8_P (c))
1332             {
1333               c = CHAR_TO_BYTE8 (c);
1334               EMIT_ONE_BYTE (c);
1335             }
1336           else
1337             {
1338               CHAR_STRING_ADVANCE (c, pend);
1339               for (p = str; p < pend; p++)
1340                 EMIT_ONE_BYTE (*p);
1341             }
1342         }
1343     }
1344   else
1345     {
1346       int safe_room = MAX_MULTIBYTE_LENGTH;
1347
1348       while (charbuf < charbuf_end)
1349         {
1350           ASSURE_DESTINATION (safe_room);
1351           c = *charbuf++;
1352           if (CHAR_BYTE8_P (c))
1353             *dst++ = CHAR_TO_BYTE8 (c);
1354           else
1355             dst += CHAR_STRING (c, dst);
1356           produced_chars++;
1357         }
1358     }
1359   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1360   coding->produced_char += produced_chars;
1361   coding->produced = dst - coding->destination;
1362   return 0;
1363 }
1364
1365
1366 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1367    Check if a text is encoded in one of UTF-16 based coding systems.
1368    If it is, return 1, else return 0.  */
1369
/* Nonzero if the bits of VAL under the mask 0xFC00 equal 0xD800,
   i.e. 0xD800..0xDBFF for a 16-bit VAL.  */
#define UTF_16_HIGH_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xD800)

/* Nonzero if the bits of VAL under the mask 0xFC00 equal 0xDC00,
   i.e. 0xDC00..0xDFFF for a 16-bit VAL.  */
#define UTF_16_LOW_SURROGATE_P(val) \
  (((val) & 0xFC00) == 0xDC00)

/* Nonzero if VAL is 0xFFFE, 0xFFFF, or satisfies
   UTF_16_LOW_SURROGATE_P.  VAL is evaluated more than once.  */
#define UTF_16_INVALID_P(val)   \
  (((val) == 0xFFFE)            \
   || ((val) == 0xFFFF)         \
   || UTF_16_LOW_SURROGATE_P (val))
1380
1381
1382 static int
1383 detect_coding_utf_16 (coding, detect_info)
1384      struct coding_system *coding;
1385      struct coding_detection_info *detect_info;
1386 {
1387   const unsigned char *src = coding->source, *src_base = src;
1388   const unsigned char *src_end = coding->source + coding->src_bytes;
1389   int multibytep = coding->src_multibyte;
1390   int consumed_chars = 0;
1391   int c1, c2;
1392
1393   detect_info->checked |= CATEGORY_MASK_UTF_16;
1394   if (coding->mode & CODING_MODE_LAST_BLOCK
1395       && (coding->src_chars & 1))
1396     {
1397       detect_info->rejected |= CATEGORY_MASK_UTF_16;
1398       return 0;
1399     }
1400
1401   ONE_MORE_BYTE (c1);
1402   ONE_MORE_BYTE (c2);
1403   if ((c1 == 0xFF) && (c2 == 0xFE))
1404     {
1405       detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1406                              | CATEGORY_MASK_UTF_16_AUTO);
1407       detect_info->rejected |= (CATEGORY_MASK_UTF_16_BE
1408                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1409                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1410     }
1411   else if ((c1 == 0xFE) && (c2 == 0xFF))
1412     {
1413       detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1414                              | CATEGORY_MASK_UTF_16_AUTO);
1415       detect_info->rejected |= (CATEGORY_MASK_UTF_16_LE
1416                                 | CATEGORY_MASK_UTF_16_BE_NOSIG
1417                                 | CATEGORY_MASK_UTF_16_LE_NOSIG);
1418     }
1419   else if (c1 >= 0 && c2 >= 0)
1420     {
1421       unsigned char b1[256], b2[256];
1422       int b1_variants = 1, b2_variants = 1;
1423       int n;
1424
1425       bzero (b1, 256), bzero (b2, 256);
1426       b1[c1]++, b2[c2]++;
1427       for (n = 0; n < 256 && src < src_end; n++)
1428         {
1429           src_base = src;
1430           ONE_MORE_BYTE (c1);
1431           ONE_MORE_BYTE (c2);
1432           if (c1 < 0 || c2 < 0)
1433             break;
1434           if (! b1[c1++]) b1_variants++;
1435           if (! b2[c2++]) b2_variants++;
1436         }
1437       if (b1_variants < b2_variants)
1438         detect_info->found |= CATEGORY_MASK_UTF_16_BE_NOSIG;
1439       else
1440         detect_info->found |= CATEGORY_MASK_UTF_16_LE_NOSIG;
1441       detect_info->rejected
1442         |= (CATEGORY_MASK_UTF_16_BE | CATEGORY_MASK_UTF_16_LE);
1443     }
1444  no_more_source:
1445   return 1;
1446 }
1447
1448 static void
1449 decode_coding_utf_16 (coding)
1450      struct coding_system *coding;
1451 {
1452   const unsigned char *src = coding->source + coding->consumed;
1453   const unsigned char *src_end = coding->source + coding->src_bytes;
1454   const unsigned char *src_base;
1455   int *charbuf = coding->charbuf + coding->charbuf_used;
1456   int *charbuf_end = coding->charbuf + coding->charbuf_size;
1457   int consumed_chars = 0, consumed_chars_base;
1458   int multibytep = coding->src_multibyte;
1459   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1460   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1461   int surrogate = CODING_UTF_16_SURROGATE (coding);
1462   Lisp_Object attr, charset_list;
1463
1464   CODING_GET_INFO (coding, attr, charset_list);
1465
1466   if (bom == utf_16_with_bom)
1467     {
1468       int c, c1, c2;
1469
1470       src_base = src;
1471       ONE_MORE_BYTE (c1);
1472       ONE_MORE_BYTE (c2);
1473       c = (c1 << 8) | c2;
1474
1475       if (endian == utf_16_big_endian
1476           ? c != 0xFEFF : c != 0xFFFE)
1477         {
1478           /* The first two bytes are not BOM.  Treat them as bytes
1479              for a normal character.  */
1480           src = src_base;
1481           coding->errors++;
1482         }
1483       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1484     }
1485   else if (bom == utf_16_detect_bom)
1486     {
1487       /* We have already tried to detect BOM and failed in
1488          detect_coding.  */
1489       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1490     }
1491
1492   while (1)
1493     {
1494       int c, c1, c2;
1495
1496       src_base = src;
1497       consumed_chars_base = consumed_chars;
1498
1499       if (charbuf + 2 >= charbuf_end)
1500         break;
1501
1502       ONE_MORE_BYTE (c1);
1503       if (c1 < 0)
1504         {
1505           *charbuf++ = -c1;
1506           continue;
1507         }
1508       ONE_MORE_BYTE (c2);
1509       if (c2 < 0)
1510         {
1511           *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
1512           *charbuf++ = -c2;
1513           continue;
1514         }
1515       c = (endian == utf_16_big_endian
1516            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1517       if (surrogate)
1518         {
1519           if (! UTF_16_LOW_SURROGATE_P (c))
1520             {
1521               if (endian == utf_16_big_endian)
1522                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1523               else
1524                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1525               *charbuf++ = c1;
1526               *charbuf++ = c2;
1527               coding->errors++;
1528               if (UTF_16_HIGH_SURROGATE_P (c))
1529                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1530               else
1531                 *charbuf++ = c;
1532             }
1533           else
1534             {
1535               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1536               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1537               *charbuf++ = 0x10000 + c;
1538             }
1539         }
1540       else
1541         {
1542           if (UTF_16_HIGH_SURROGATE_P (c))
1543             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1544           else
1545             *charbuf++ = c;
1546         }
1547     }
1548
1549  no_more_source:
1550   coding->consumed_char += consumed_chars_base;
1551   coding->consumed = src_base - coding->source;
1552   coding->charbuf_used = charbuf - coding->charbuf;
1553 }
1554
1555 static int
1556 encode_coding_utf_16 (coding)
1557      struct coding_system *coding;
1558 {
1559   int multibytep = coding->dst_multibyte;
1560   int *charbuf = coding->charbuf;
1561   int *charbuf_end = charbuf + coding->charbuf_used;
1562   unsigned char *dst = coding->destination + coding->produced;
1563   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1564   int safe_room = 8;
1565   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1566   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1567   int produced_chars = 0;
1568   Lisp_Object attrs, charset_list;
1569   int c;
1570
1571   CODING_GET_INFO (coding, attrs, charset_list);
1572
1573   if (bom != utf_16_without_bom)
1574     {
1575       ASSURE_DESTINATION (safe_room);
1576       if (big_endian)
1577         EMIT_TWO_BYTES (0xFE, 0xFF);
1578       else
1579         EMIT_TWO_BYTES (0xFF, 0xFE);
1580       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1581     }
1582
1583   while (charbuf < charbuf_end)
1584     {
1585       ASSURE_DESTINATION (safe_room);
1586       c = *charbuf++;
1587       if (c >= MAX_UNICODE_CHAR)
1588         c = coding->default_char;
1589
1590       if (c < 0x10000)
1591         {
1592           if (big_endian)
1593             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1594           else
1595             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1596         }
1597       else
1598         {
1599           int c1, c2;
1600
1601           c -= 0x10000;
1602           c1 = (c >> 10) + 0xD800;
1603           c2 = (c & 0x3FF) + 0xDC00;
1604           if (big_endian)
1605             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1606           else
1607             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1608         }
1609     }
1610   record_conversion_result (coding, CODING_RESULT_SUCCESS);
1611   coding->produced = dst - coding->destination;
1612   coding->produced_char += produced_chars;
1613   return 0;
1614 }
1615
1616 \f
1617 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1618
1619 /* Emacs' internal format for representation of multiple character
1620    sets is a kind of multi-byte encoding, i.e. characters are
1621    represented by variable-length sequences of one-byte codes.
1622
1623    ASCII characters and control characters (e.g. `tab', `newline') are
1624    represented by one-byte sequences which are their ASCII codes, in
1625    the range 0x00 through 0x7F.
1626
1627    8-bit characters of the range 0x80..0x9F are represented by
1628    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1629    code + 0x20).
1630
1631    8-bit characters of the range 0xA0..0xFF are represented by
1632    one-byte sequences which are their 8-bit code.
1633
1634    The other characters are represented by a sequence of `base
1635    leading-code', optional `extended leading-code', and one or two
1636    `position-code's.  The length of the sequence is determined by the
1637    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1638    whereas extended leading-code and position-code take the range 0xA0
1639    through 0xFF.  See `charset.h' for more details about leading-code
1640    and position-code.
1641
1642    --- CODE RANGE of Emacs' internal format ---
1643    character set        range
1644    -------------        -----
1645    ascii                0x00..0x7F
1646    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1647    eight-bit-graphic    0xA0..0xFF
1648    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1649    ---------------------------------------------
1650
1651    As this is the internal character representation, the format is
1652    usually not used externally (i.e. in a file or in a data sent to a
1653    process).  But, it is possible to have a text externally in this
1654    format (i.e. by encoding by the coding system `emacs-mule').
1655
1656    In that case, a sequence of one-byte codes has a slightly different
1657    form.
1658
1659    At first, all characters in eight-bit-control are represented by
1660    one-byte sequences which are their 8-bit code.
1661
1662    Next, character composition data are represented by the byte
1663    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1664    where,
1665         METHOD is 0xF2 plus one of the composition methods (enum
1666         composition_method),
1667
1668         BYTES is 0xA0 plus the byte length of this composition data,
1669
1670         CHARS is 0xA0 plus the number of characters composed by this
1671         data,
1672
1673         COMPONENTs are characters in multibyte form or composition
1674         rules, each rule encoded as two bytes of ASCII codes.
1675
1676    In addition, for backward compatibility, the following formats are
1677    also recognized as composition data on decoding.
1678
1679    0x80 MSEQ ...
1680    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1681
1682    Here,
1683         MSEQ is a multibyte form, but in one of these special formats:
1684           ASCII: 0xA0 ASCII_CODE+0x80,
1685           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1686         RULE is a one byte code of the range 0xA0..0xF0 that
1687         represents a composition rule.
1688   */
1689
1690 char emacs_mule_bytes[256];
1691
1692 int
1693 emacs_mule_char (coding, src, nbytes, nchars, id)
1694      struct coding_system *coding;
1695      const unsigned char *src;
1696      int *nbytes, *nchars, *id;
1697 {
1698   const unsigned char *src_end = coding->source + coding->src_bytes;
1699   const unsigned char *src_base = src;
1700   int multibytep = coding->src_multibyte;
1701   struct charset *charset;
1702   unsigned code;
1703   int c;
1704   int consumed_chars = 0;
1705
1706   ONE_MORE_BYTE (c);
1707   if (c < 0)
1708     {
1709       c = -c;
1710       charset = emacs_mule_charset[0];
1711     }
1712   else
1713     {
1714       switch (emacs_mule_bytes[c])
1715         {
1716         case 2:
1717           if (! (charset = emacs_mule_charset[c]))
1718             goto invalid_code;
1719           ONE_MORE_BYTE (c);
1720           if (c < 0xA0)
1721             goto invalid_code;
1722           code = c & 0x7F;
1723           break;
1724
1725         case 3:
1726           if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1727               || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1728             {
1729               ONE_MORE_BYTE (c);
1730               if (c < 0xA0 || ! (charset = emacs_mule_charset[c]))
1731                 goto invalid_code;
1732               ONE_MORE_BYTE (c);
1733               if (c < 0xA0)
1734                 goto invalid_code;
1735               code = c & 0x7F;
1736             }
1737           else
1738             {
1739               if (! (charset = emacs_mule_charset[c]))
1740                 goto invalid_code;
1741               ONE_MORE_BYTE (c);
1742               if (c < 0xA0)
1743                 goto invalid_code;
1744               code = (c & 0x7F) << 8;
1745               ONE_MORE_BYTE (c);
1746               if (c < 0xA0)
1747                 goto invalid_code;
1748               code |= c & 0x7F;
1749             }
1750           break;
1751
1752         case 4:
1753           ONE_MORE_BYTE (c);
1754           if (c < 0 || ! (charset = emacs_mule_charset[c]))
1755             goto invalid_code;
1756           ONE_MORE_BYTE (c);
1757           if (c < 0xA0)
1758             goto invalid_code;
1759           code = (c & 0x7F) << 8;
1760           ONE_MORE_BYTE (c);
1761           if (c < 0xA0)
1762             goto invalid_code;
1763           code |= c & 0x7F;
1764           break;
1765
1766         case 1:
1767           code = c;
1768           charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1769                                      ? charset_ascii : charset_eight_bit);
1770           break;
1771
1772         default:
1773           abort ();
1774         }
1775       c = DECODE_CHAR (charset, code);
1776       if (c < 0)
1777         goto invalid_code;
1778     }
1779   *nbytes = src - src_base;
1780   *nchars = consumed_chars;
1781   if (id)
1782     *id = charset->id;
1783   return c;
1784
1785  no_more_source:
1786   return -2;
1787
1788  invalid_code:
1789   return -1;
1790 }
1791
1792
1793 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1794    Check if a text is encoded in `emacs-mule'.  If it is, return 1,
1795    else return 0.  */
1796
1797 static int
1798 detect_coding_emacs_mule (coding, detect_info)
1799      struct coding_system *coding;
1800      struct coding_detection_info *detect_info;
1801 {
1802   const unsigned char *src = coding->source, *src_base;
1803   const unsigned char *src_end = coding->source + coding->src_bytes;
1804   int multibytep = coding->src_multibyte;
1805   int consumed_chars = 0;
1806   int c;
1807   int found = 0;
1808
1809   detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1810   /* A coding system of this category is always ASCII compatible.  */
1811   src += coding->head_ascii;
1812
1813   while (1)
1814     {
1815       src_base = src;
1816       ONE_MORE_BYTE (c);
1817       if (c < 0)
1818         continue;
1819       if (c == 0x80)
1820         {
1821           /* Perhaps the start of composite character.  We simple skip
1822              it because analyzing it is too heavy for detecting.  But,
1823              at least, we check that the composite character
1824              constitues of more than 4 bytes.  */
1825           const unsigned char *src_base;
1826
1827         repeat:
1828           src_base = src;
1829           do
1830             {
1831               ONE_MORE_BYTE (c);
1832             }
1833           while (c >= 0xA0);
1834
1835           if (src - src_base <= 4)
1836             break;
1837           found = CATEGORY_MASK_EMACS_MULE;
1838           if (c == 0x80)
1839             goto repeat;
1840         }
1841
1842       if (c < 0x80)
1843         {
1844           if (c < 0x20
1845               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1846             break;
1847         }
1848       else
1849         {
1850           int more_bytes = emacs_mule_bytes[*src_base] - 1;
1851
1852           while (more_bytes > 0)
1853             {
1854               ONE_MORE_BYTE (c);
1855               if (c < 0xA0)
1856                 {
1857                   src--;        /* Unread the last byte.  */
1858                   break;
1859                 }
1860               more_bytes--;
1861             }
1862           if (more_bytes != 0)
1863             break;
1864           found = CATEGORY_MASK_EMACS_MULE;
1865         }
1866     }
1867   detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1868   return 0;
1869
1870  no_more_source:
1871   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
1872     {
1873       detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1874       return 0;
1875     }
1876   detect_info->found |= found;
1877   return 1;
1878 }
1879
1880
1881 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1882
/* Decode the character at SRC as one component of a composition
   sequence of Emacs 20/21 style.  On success, store the character in
   *BUF, increment BUF, and advance SRC and CONSUMED_CHARS past it.

   If SRC is already at SRC_END, or the character is cut short by the
   end of the source (emacs_mule_char returns -2), execute `break' in
   the context where the macro is expanded.  On any other failure,
   jump to the label `invalid_code'.

   The body is written as `if (1) ... else' rather than
   `do ... while (0)' so that the `break' acts on the loop enclosing
   the expansion and the macro can still be followed by `;'.  */

#define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                         \
  if (1)                                                                \
    {                                                                   \
      int component_char;                                               \
      int component_bytes, component_chars;                             \
                                                                        \
      if (src == src_end)                                               \
        break;                                                          \
      component_char = emacs_mule_char (coding, src, &component_bytes,  \
                                        &component_chars, NULL);        \
      if (component_char == -2)                                         \
        break;                                                          \
      if (component_char < 0)                                           \
        goto invalid_code;                                              \
      consumed_chars += component_chars;                                \
      src += component_bytes;                                           \
      *buf++ = component_char;                                          \
    }                                                                   \
  else
1910
1911
/* Decode a composition rule represented as a component of composition
   sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
   and increment BUF.

   The rule is a single byte in the range 0xA0..0xF0 (see the format
   description at the head of this section); after subtracting 0xA0
   the value is GREF * 9 + NREF.  If SRC is at SRC_END or the byte is
   outside that range, jump to the label `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
  do {                                                  \
    int c, gref, nref;                                  \
                                                        \
    if (src >= src_end)                                 \
      goto invalid_code;                                \
    ONE_MORE_BYTE_NO_CHECK (c);                         \
    /* Rebase from 0xA0, not 0x20: 0x20 is the offset   \
       of the two-byte Emacs 21 rule format.  */        \
    c -= 0xA0;                                          \
    if (c < 0 || c >= 81)                               \
      goto invalid_code;                                \
                                                        \
    gref = c / 9, nref = c % 9;                         \
    *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
  } while (0)
1931
1932
/* Decode a composition rule represented as a component of composition
   sequence of Emacs 21 style at SRC.  The rule occupies two bytes,
   each holding a reference value plus 0x20.  Store the decoded rule in
   *BUF, and increment BUF.  If fewer than two bytes remain before
   SRC_END, or either value is outside 0..80 after removing the
   offset, jump to the label `invalid_code'.  */

#define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)                      \
  do {                                                                  \
    int global_ref, new_ref;                                            \
                                                                        \
    if (src_end - src < 2)                                              \
      goto invalid_code;                                                \
    ONE_MORE_BYTE_NO_CHECK (global_ref);                                \
    global_ref -= 0x20;                                                 \
    ONE_MORE_BYTE_NO_CHECK (new_ref);                                   \
    new_ref -= 0x20;                                                    \
    if (! (global_ref >= 0 && global_ref < 81                           \
           && new_ref >= 0 && new_ref < 81))                            \
      goto invalid_code;                                                \
    *buf++ = COMPOSITION_ENCODE_RULE (global_ref, new_ref);             \
  } while (0)
1953
1954
/* Decode the header of an Emacs 21 style composition and, for every
   method except COMPOSITION_RELATIVE, the components that follow it.
   C holds the METHOD byte that the caller has just read.  The
   composition annotation is appended at CHARBUF.  Jump to the label
   `invalid_code' when the header or a component is malformed.  */

#define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
  do {                                                                  \
    /* Emacs 21 style format.  C is the METHOD byte (0xF2 + method);    \
       the next two bytes at SRC are BYTES + 0xA0 and CHARS + 0xA0,     \
       where BYTES is the byte length of this composition information   \
       and CHARS is the number of characters composed by it.  */        \
    enum composition_method comp_method = c - 0xF2;                     \
    int *annotation_head = charbuf;                                     \
    int comp_bytes, comp_chars;                                         \
    int chars_limit;                                                    \
                                                                        \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    comp_bytes = c - 0xA0;                                              \
    if (comp_bytes < 3)                                                 \
      goto invalid_code;                                                \
    ONE_MORE_BYTE (c);                                                  \
    if (c < 0)                                                          \
      goto invalid_code;                                                \
    comp_chars = c - 0xA0;                                              \
    ADD_COMPOSITION_DATA (charbuf, comp_chars, comp_method);            \
    chars_limit = consumed_chars_base + comp_bytes;                     \
    if (comp_method != COMPOSITION_RELATIVE)                            \
      {                                                                 \
        int ncomponents;                                                \
                                                                        \
        /* Components alternate character, rule, character, ...         \
           except with COMPOSITION_WITH_ALTCHARS, where every one is a  \
           character.  A `break' from the character macro leaves this   \
           loop without counting the failed component.  */              \
        for (ncomponents = 0; consumed_chars < chars_limit;             \
             ncomponents++)                                             \
          {                                                             \
            if (ncomponents % 2 != 0                                    \
                && comp_method != COMPOSITION_WITH_ALTCHARS)            \
              DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
            else                                                        \
              DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
          }                                                             \
        if (consumed_chars < chars_limit)                               \
          goto invalid_code;                                            \
        /* Account for the components in the first element of the       \
           annotation written by ADD_COMPOSITION_DATA.  */              \
        annotation_head[0] -= ncomponents;                              \
      }                                                                 \
  } while (0)
1994
1995
/* Decode an Emacs 20 style relative composition (0x80 MSEQ ...).  The
   caller has read the 0x80 and one byte of the first component, so
   decoding restarts from SRC_BASE.  At least two components are
   required; otherwise jump to the label `invalid_code'.  The
   composition annotation and then the decoded characters are appended
   at CHARBUF.  */

#define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
  do {                                                          \
    /* Emacs 20 style format for relative composition.  */      \
    /* Store multibyte form of characters to be composed.  */   \
    enum composition_method method = COMPOSITION_RELATIVE;      \
    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
    int *buf = components;                                      \
    int i, j;                                                   \
                                                                \
    /* Rewind both the read pointer and the character count:    \
       rewinding SRC alone would count the 0x80 and the byte    \
       after it a second time.  */                              \
    src = src_base;                                             \
    consumed_chars = consumed_chars_base;                       \
    ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
    for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
      DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
    if (i < 2)                                                  \
      goto invalid_code;                                        \
    ADD_COMPOSITION_DATA (charbuf, i, method);                  \
    for (j = 0; j < i; j++)                                     \
      *charbuf++ = components[j];                               \
  } while (0)
2015
2016
/* Decode an Emacs 20 style rule-base composition
   (0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ).  The caller has read the
   0x80 0xFF prefix; SRC points at the first MSEQ.

   Characters and rules are first collected in a local array, which
   holds at most MAX_COMPOSITION_COMPONENTS characters with a rule
   between each pair.  A valid sequence has at least two characters
   and ends with a character; anything else jumps to the label
   `invalid_code'.  If CHARBUF cannot take the result, jump to the
   label `no_more_source' so that the sequence is decoded again from
   its start on the next call.

   What is appended at CHARBUF: the composition annotation, the
   components (characters and rules), and then the characters
   themselves.  NOTE(review): this layout follows what
   DECODE_EMACS_MULE_21_COMPOSITION does for its components; confirm
   against ADD_COMPOSITION_DATA and the code that consumes the
   annotation.  */

#define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
  do {                                                          \
    /* Emacs 20 style format for rule-base composition.  */     \
    /* Store multibyte form of characters to be composed.  */   \
    enum composition_method method = COMPOSITION_WITH_RULE;     \
    int *charbuf_base = charbuf;                                \
    int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
    int *buf = components;                                      \
    int i, j, ncomponents;                                      \
                                                                \
    /* I counts decoded characters.  Every character but the    \
       first is preceded by a rule, so COMPONENTS never holds   \
       more than 2 * MAX_COMPOSITION_COMPONENTS - 1 entries.    \
       Rule and component bytes are all 0xA0 or above, so a     \
       smaller byte (or the end of the source) ends the         \
       sequence.  The first character is decoded inside the     \
       loop so that a `break' from the character macro leaves   \
       the loop and reaches the checks below.  */               \
    for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
      {                                                         \
        if (i > 0)                                              \
          {                                                     \
            if (src >= src_end || *src < 0xA0)                  \
              break;                                            \
            DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);        \
          }                                                     \
        DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
      }                                                         \
    ncomponents = buf - components;                             \
    /* An even count means a rule was not followed by a         \
       character.  */                                           \
    if (i < 2 || ncomponents % 2 == 0)                          \
      goto invalid_code;                                        \
    /* Give up for now when CHARBUF has no room for the         \
       components plus the characters.  */                      \
    if (charbuf + ncomponents + i >= charbuf_end)               \
      goto no_more_source;                                      \
    ADD_COMPOSITION_DATA (charbuf, i, method);                  \
    for (j = 0; j < ncomponents; j++)                           \
      *charbuf++ = components[j];                               \
    /* Account for the components in the first element of the   \
       annotation, as the Emacs 21 decoder does.  */            \
    charbuf_base[0] -= ncomponents;                             \
    /* The characters sit at the even indices.  */              \
    for (j = 0; j < ncomponents; j += 2)                        \
      *charbuf++ = components[j];                               \
  } while (0)
2042
2043
2044 static void
2045 decode_coding_emacs_mule (coding)
2046      struct coding_system *coding;
2047 {
2048   const unsigned char *src = coding->source + coding->consumed;
2049   const unsigned char *src_end = coding->source + coding->src_bytes;
2050   const unsigned char *src_base;
2051   int *charbuf = coding->charbuf + coding->charbuf_used;
2052   int *charbuf_end
2053     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
2054   int consumed_chars = 0, consumed_chars_base;
2055   int multibytep = coding->src_multibyte;
2056   Lisp_Object attrs, charset_list;
2057   int char_offset = coding->produced_char;
2058   int last_offset = char_offset;
2059   int last_id = charset_ascii;
2060
2061   CODING_GET_INFO (coding, attrs, charset_list);
2062
2063   while (1)
2064     {
2065       int c;
2066
2067       src_base = src;
2068       consumed_chars_base = consumed_chars;
2069
2070       if (charbuf >= charbuf_end)
2071         break;
2072
2073       ONE_MORE_BYTE (c);
2074       if (c < 0)
2075         {
2076           *charbuf++ = -c;
2077           char_offset++;
2078         }
2079       else if (c < 0x80)
2080         {
2081           *charbuf++ = c;
2082           char_offset++;
2083         }
2084       else if (c == 0x80)
2085         {
2086           ONE_MORE_BYTE (c);
2087           if (c < 0)
2088             goto invalid_code;
2089           if (c - 0xF2 >= COMPOSITION_RELATIVE
2090               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
2091             DECODE_EMACS_MULE_21_COMPOSITION (c);
2092           else if (c < 0xC0)
2093             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
2094           else if (c == 0xFF)
2095             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
2096           else
2097             goto invalid_code;
2098         }
2099       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
2100         {
2101           int nbytes, nchars;
2102           int id;
2103
2104           src = src_base;
2105           consumed_chars = consumed_chars_base;
2106           c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
2107           if (c < 0)
2108             {
2109               if (c == -2)
2110                 break;
2111               goto invalid_code;
2112             }
2113           if (last_id != id)
2114             {
2115               if (last_id != charset_ascii)
2116                 ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2117               last_id = id;
2118               last_offset = char_offset;
2119             }
2120           *charbuf++ = c;
2121           src += nbytes;
2122           consumed_chars += nchars;
2123           char_offset++;
2124         }
2125       continue;
2126
2127     invalid_code:
2128       src = src_base;
2129       consumed_chars = consumed_chars_base;
2130       ONE_MORE_BYTE (c);
2131       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2132       char_offset++;
2133       coding->errors++;
2134     }
2135
2136  no_more_source:
2137   if (last_id != charset_ascii)
2138     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
2139   coding->consumed_char += consumed_chars_base;
2140   coding->consumed = src_base - coding->source;
2141   coding->charbuf_used = charbuf - coding->charbuf;
2142 }
2143
2144
/* Store in CODES[0] and CODES[1] the leading-code bytes used for the
   emacs-mule charset id ID.  An id below 0xA0 is its own leading-code
   and CODES[1] is set to 0.  A larger id is stored in CODES[1] and
   preceded by 0x9A, 0x9B, 0x9C or 0x9D according to its range.

   The expansion is a single statement with no trailing semicolon, so
   the macro may be used as the body of an `if' that has an `else'.
   ID and CODES are evaluated more than once.  */

#define EMACS_MULE_LEADING_CODES(id, codes)             \
  do {                                                  \
    if ((id) < 0xA0)                                    \
      (codes)[0] = (id), (codes)[1] = 0;                \
    else if ((id) < 0xE0)                               \
      (codes)[0] = 0x9A, (codes)[1] = (id);             \
    else if ((id) < 0xF0)                               \
      (codes)[0] = 0x9B, (codes)[1] = (id);             \
    else if ((id) < 0xF5)                               \
      (codes)[0] = 0x9C, (codes)[1] = (id);             \
    else                                                \
      (codes)[0] = 0x9D, (codes)[1] = (id);             \
  } while (0)
2158
2159
2160 static int
2161 encode_coding_emacs_mule (coding)
2162      struct coding_system *coding;
2163 {
2164   int multibytep = coding->dst_multibyte;
2165   int *charbuf = coding->charbuf;
2166   int *charbuf_end = charbuf + coding->charbuf_used;
2167   unsigned char *dst = coding->destination + coding->produced;
2168   unsigned char *dst_end = coding->destination + coding->dst_bytes;
2169   int safe_room = 8;
2170   int produced_chars = 0;
2171   Lisp_Object attrs, charset_list;
2172   int c;
2173   int preferred_charset_id = -1;
2174
2175   CODING_GET_INFO (coding, attrs, charset_list);
2176   if (! EQ (charset_list, Vemacs_mule_charset_list))
2177     {
2178       CODING_ATTR_CHARSET_LIST (attrs)
2179         = charset_list = Vemacs_mule_charset_list;
2180     }
2181
2182   while (charbuf < charbuf_end)
2183     {
2184       ASSURE_DESTINATION (safe_room);
2185       c = *charbuf++;
2186
2187       if (c < 0)
2188         {
2189           /* Handle an annotation.  */
2190           switch (*charbuf)
2191             {
2192             case CODING_ANNOTATE_COMPOSITION_MASK:
2193               /* Not yet implemented.  */
2194               break;
2195             case CODING_ANNOTATE_CHARSET_MASK:
2196               preferred_charset_id = charbuf[3];
2197               if (preferred_charset_id >= 0
2198                   && NILP (Fmemq (make_number (preferred_charset_id),
2199                                   charset_list)))
2200                 preferred_charset_id = -1;
2201               break;
2202             default:
2203               abort ();
2204             }
2205           charbuf += -c - 1;
2206           continue;
2207         }
2208
2209       if (ASCII_CHAR_P (c))
2210         EMIT_ONE_ASCII_BYTE (c);
2211       else if (CHAR_BYTE8_P (c))
2212         {
2213           c = CHAR_TO_BYTE8 (c);
2214           EMIT_ONE_BYTE (c);
2215         }
2216       else
2217         {
2218           struct charset *charset;
2219           unsigned code;
2220           int dimension;
2221           int emacs_mule_id;
2222           unsigned char leading_codes[2];
2223
2224           if (preferred_charset_id >= 0)
2225             {
2226               charset = CHARSET_FROM_ID (preferred_charset_id);
2227               if (! CHAR_CHARSET_P (c, charset))
2228                 charset = char_charset (c, charset_list, NULL);
2229             }
2230           else
2231             charset = char_charset (c, charset_list, &code);
2232           if (! charset)
2233             {
2234               c = coding->default_char;
2235               if (ASCII_CHAR_P (c))
2236                 {
2237                   EMIT_ONE_ASCII_BYTE (c);
2238                   continue;
2239                 }
2240               charset = char_charset (c, charset_list, &code);
2241             }
2242           dimension = CHARSET_DIMENSION (charset);
2243           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2244           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2245           EMIT_ONE_BYTE (leading_codes[0]);
2246           if (leading_codes[1])
2247             EMIT_ONE_BYTE (leading_codes[1]);
2248           if (dimension == 1)
2249             EMIT_ONE_BYTE (code | 0x80);
2250           else
2251             {
2252               code |= 0x8080;
2253               EMIT_ONE_BYTE (code >> 8);
2254               EMIT_ONE_BYTE (code & 0xFF);
2255             }
2256         }
2257     }
2258   record_conversion_result (coding, CODING_RESULT_SUCCESS);
2259   coding->produced_char += produced_chars;
2260   coding->produced = dst - coding->destination;
2261   return 0;
2262 }
2263
2264 \f
2265 /*** 7. ISO2022 handlers ***/
2266
2267 /* The following note describes the coding system ISO2022 briefly.
2268    Since the intention of this note is to help understand the
2269    functions in this file, some parts are NOT ACCURATE or are OVERLY
2270    SIMPLIFIED.  For thorough understanding, please refer to the
2271    original document of ISO2022.  This is equivalent to the standard
2272    ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2273
2274    ISO2022 provides many mechanisms to encode several character sets
2275    in 7-bit and 8-bit environments.  For 7-bit environments, all text
2276    is encoded using bytes less than 128.  This may make the encoded
2277    text a little bit longer, but the text passes more easily through
2278    several types of gateway, some of which strip off the MSB (Most
2279    Significant Bit).
2280
2281    There are two kinds of character sets: control character sets and
2282    graphic character sets.  The former contain control characters such
2283    as `newline' and `escape' to provide control functions (control
2284    functions are also provided by escape sequences).  The latter
2285    contain graphic characters such as 'A' and '-'.  Emacs recognizes
2286    two control character sets and many graphic character sets.
2287
2288    Graphic character sets are classified into one of the following
2289    four classes, according to the number of bytes (DIMENSION) and
2290    number of characters in one dimension (CHARS) of the set:
2291    - DIMENSION1_CHARS94
2292    - DIMENSION1_CHARS96
2293    - DIMENSION2_CHARS94
2294    - DIMENSION2_CHARS96
2295
2296    In addition, each character set is assigned an identification tag,
2297    unique for each set, called the "final character" (denoted as <F>
2298    hereafter).  The <F> of each character set is decided by ECMA(*)
2299    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2300    (0x30..0x3F are for private use only).
2301
2302    Note (*): ECMA = European Computer Manufacturers Association
2303
2304    Here are examples of graphic character sets [NAME(<F>)]:
2305         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2306         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2307         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2308         o DIMENSION2_CHARS96 -- none for the moment
2309
2310    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2311         C0 [0x00..0x1F] -- control character plane 0
2312         GL [0x20..0x7F] -- graphic character plane 0
2313         C1 [0x80..0x9F] -- control character plane 1
2314         GR [0xA0..0xFF] -- graphic character plane 1
2315
2316    A control character set is directly designated and invoked to C0 or
2317    C1 by an escape sequence.  The most common case is that:
2318    - ISO646's  control character set is designated/invoked to C0, and
2319    - ISO6429's control character set is designated/invoked to C1,
2320    and usually these designations/invocations are omitted in encoded
2321    text.  In a 7-bit environment, only C0 can be used, and a control
2322    character for C1 is encoded by an appropriate escape sequence to
2323    fit into the environment.  All control characters for C1 are
2324    defined to have corresponding escape sequences.
2325
2326    A graphic character set is at first designated to one of four
2327    graphic registers (G0 through G3), then these graphic registers are
2328    invoked to GL or GR.  These designations and invocations can be
2329    done independently.  The most common case is that G0 is invoked to
2330    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2331    these invocations and designations are omitted in encoded text.
2332    In a 7-bit environment, only GL can be used.
2333
2334    When a graphic character set of CHARS94 is invoked to GL, codes
2335    0x20 and 0x7F of the GL area work as control characters SPACE and
2336    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2337    be used.
2338
2339    There are two ways of invocation: locking-shift and single-shift.
2340    With locking-shift, the invocation lasts until the next different
2341    invocation, whereas with single-shift, the invocation affects the
2342    following character only and doesn't affect the locking-shift
2343    state.  Invocations are done by the following control characters or
2344    escape sequences:
2345
2346    ----------------------------------------------------------------------
2347    abbrev  function                  cntrl escape seq   description
2348    ----------------------------------------------------------------------
2349    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2350    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2351    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2352    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2353    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2354    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2355    LS3R    (locking-shift-3 right)   none  ESC '|'      invoke G3 into GR (*)
2356    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2357    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2358    ----------------------------------------------------------------------
2359    (*) These are not used by any known coding system.
2360
2361    Control characters for these functions are defined by macros
2362    ISO_CODE_XXX in `coding.h'.
2363
2364    Designations are done by the following escape sequences:
2365    ----------------------------------------------------------------------
2366    escape sequence      description
2367    ----------------------------------------------------------------------
2368    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2369    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2370    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2371    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2372    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2373    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2374    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2375    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2376    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2377    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2378    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2379    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2380    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2381    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2382    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2383    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2384    ----------------------------------------------------------------------
2385
2386    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2387    of dimension 1, chars 94, and final character <F>, etc...
2388
2389    Note (*): Although these designations are not allowed in ISO2022,
2390    Emacs accepts them on decoding, and produces them on encoding
2391    CHARS96 character sets in a coding system which is characterized as
2392    7-bit environment, non-locking-shift, and non-single-shift.
2393
2394    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2395    '(' must be omitted.  We refer to this as "short-form" hereafter.
2396
2397    Now you may notice that there are a lot of ways of encoding the
2398    same multilingual text in ISO2022.  Actually, there exist many
2399    coding systems such as Compound Text (used in X11's inter client
2400    communication), ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2401    (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2402    localized platforms), and all of these are variants of ISO2022.
2403
2404    In addition to the above, Emacs handles two more kinds of escape
2405    sequences: ISO6429's direction specification and Emacs' private
2406    sequence for specifying character composition.
2407
2408    ISO6429's direction specification takes the following form:
2409         o CSI ']'      -- end of the current direction
2410         o CSI '0' ']'  -- end of the current direction
2411         o CSI '1' ']'  -- start of left-to-right text
2412         o CSI '2' ']'  -- start of right-to-left text
2413    The control character CSI (0x9B: control sequence introducer) is
2414    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2415
2416    Character composition specification takes the following form:
2417         o ESC '0' -- start relative composition
2418         o ESC '1' -- end composition
2419         o ESC '2' -- start rule-base composition (*)
2420         o ESC '3' -- start relative composition with alternate chars  (**)
2421         o ESC '4' -- start rule-base composition with alternate chars  (**)
2422   Since these are not standard escape sequences of any ISO standard,
2423   the use of them with these meanings is restricted to Emacs only.
2424
2425   (*) This form is used only in Emacs 20.7 and older versions,
2426   but newer versions can safely decode it.
2427   (**) This form is used only in Emacs 21.1 and newer versions,
2428   and older versions can't decode it.
2429
2430   Here's a list of example usages of these composition escape
2431   sequences (categorized by `enum composition_method').
2432
2433   COMPOSITION_RELATIVE:
2434         ESC 0 CHAR [ CHAR ] ESC 1
2435   COMPOSITION_WITH_RULE:
2436         ESC 2 CHAR [ RULE CHAR ] ESC 1
2437   COMPOSITION_WITH_ALTCHARS:
2438         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2439   COMPOSITION_WITH_RULE_ALTCHARS:
2440         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2441
/* Table of 256 code classes indexed by a byte value;
   decode_coding_iso_2022 switches on iso_code_class[c1] to classify
   the byte it has just read.  */
2442 enum iso_code_class_type iso_code_class[256];
2443
/* Return nonzero if charset ID may be used with CODING: ID must lie
   in 0..CODING->max_charset_id and its slot in CODING->safe_charsets
   must not hold 255, the filler byte that setup_iso_safe_charsets
   leaves in the slots of charsets it does not register.

   The slot is read as `unsigned char' so that the test gives the same
   answer whether plain `char' is signed or unsigned, and a negative
   ID is rejected before it can index in front of the table.
   ID is evaluated more than once; pass an expression without side
   effects.  */
#define SAFE_CHARSET_P(coding, id)                              \
  ((id) >= 0                                                    \
   && (id) <= (coding)->max_charset_id                          \
   && (unsigned char) (coding)->safe_charsets[id] != 255)
2447
2448
/* Nonzero if CODING_ISO_INITIAL (..., 1) of the coding system stored
   in coding_categories[CATEGORY] is non-negative.
   NOTE(review): presumably this means a charset is initially
   designated to graphic register G1, so that a shift-out is
   meaningful for that category -- confirm against the definition of
   CODING_ISO_INITIAL.  */
2449 #define SHIFT_OUT_OK(category)  \
2450   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2451
2452 static void
2453 setup_iso_safe_charsets (attrs)
2454      Lisp_Object attrs;
2455 {
2456   Lisp_Object charset_list, safe_charsets;
2457   Lisp_Object request;
2458   Lisp_Object reg_usage;
2459   Lisp_Object tail;
2460   int reg94, reg96;
2461   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2462   int max_charset_id;
2463
2464   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2465   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2466       && ! EQ (charset_list, Viso_2022_charset_list))
2467     {
2468       CODING_ATTR_CHARSET_LIST (attrs)
2469         = charset_list = Viso_2022_charset_list;
2470       ASET (attrs, coding_attr_safe_charsets, Qnil);
2471     }
2472
2473   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2474     return;
2475
2476   max_charset_id = 0;
2477   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2478     {
2479       int id = XINT (XCAR (tail));
2480       if (max_charset_id < id)
2481         max_charset_id = id;
2482     }
2483
2484   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2485                                 make_number (255));
2486   request = AREF (attrs, coding_attr_iso_request);
2487   reg_usage = AREF (attrs, coding_attr_iso_usage);
2488   reg94 = XINT (XCAR (reg_usage));
2489   reg96 = XINT (XCDR (reg_usage));
2490
2491   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2492     {
2493       Lisp_Object id;
2494       Lisp_Object reg;
2495       struct charset *charset;
2496
2497       id = XCAR (tail);
2498       charset = CHARSET_FROM_ID (XINT (id));
2499       reg = Fcdr (Fassq (id, request));
2500       if (! NILP (reg))
2501         SSET (safe_charsets, XINT (id), XINT (reg));
2502       else if (charset->iso_chars_96)
2503         {
2504           if (reg96 < 4)
2505             SSET (safe_charsets, XINT (id), reg96);
2506         }
2507       else
2508         {
2509           if (reg94 < 4)
2510             SSET (safe_charsets, XINT (id), reg94);
2511         }
2512     }
2513   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2514 }
2515
2516
2517 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2518    Check if a text is encoded in one of ISO-2022 based codig systems.
2519    If it is, return 1, else return 0.  */
2520
2521 static int
2522 detect_coding_iso_2022 (coding, detect_info)
2523      struct coding_system *coding;
2524      struct coding_detection_info *detect_info;
2525 {
2526   const unsigned char *src = coding->source, *src_base = src;
2527   const unsigned char *src_end = coding->source + coding->src_bytes;
2528   int multibytep = coding->src_multibyte;
2529   int single_shifting = 0;
2530   int id;
2531   int c, c1;
2532   int consumed_chars = 0;
2533   int i;
2534   int rejected = 0;
2535   int found = 0;
2536
2537   detect_info->checked |= CATEGORY_MASK_ISO;
2538
2539   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2540     {
2541       struct coding_system *this = &(coding_categories[i]);
2542       Lisp_Object attrs, val;
2543
2544       attrs = CODING_ID_ATTRS (this->id);
2545       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2546           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2547         setup_iso_safe_charsets (attrs);
2548       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2549       this->max_charset_id = SCHARS (val) - 1;
2550       this->safe_charsets = (char *) SDATA (val);
2551     }
2552
2553   /* A coding system of this category is always ASCII compatible.  */
2554   src += coding->head_ascii;
2555
2556   while (rejected != CATEGORY_MASK_ISO)
2557     {
2558       src_base = src;
2559       ONE_MORE_BYTE (c);
2560       switch (c)
2561         {
2562         case ISO_CODE_ESC:
2563           if (inhibit_iso_escape_detection)
2564             break;
2565           single_shifting = 0;
2566           ONE_MORE_BYTE (c);
2567           if (c >= '(' && c <= '/')
2568             {
2569               /* Designation sequence for a charset of dimension 1.  */
2570               ONE_MORE_BYTE (c1);
2571               if (c1 < ' ' || c1 >= 0x80
2572                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2573                 /* Invalid designation sequence.  Just ignore.  */
2574                 break;
2575             }
2576           else if (c == '$')
2577             {
2578               /* Designation sequence for a charset of dimension 2.  */
2579               ONE_MORE_BYTE (c);
2580               if (c >= '@' && c <= 'B')
2581                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2582                 id = iso_charset_table[1][0][c];
2583               else if (c >= '(' && c <= '/')
2584                 {
2585                   ONE_MORE_BYTE (c1);
2586                   if (c1 < ' ' || c1 >= 0x80
2587                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2588                     /* Invalid designation sequence.  Just ignore.  */
2589                     break;
2590                 }
2591               else
2592                 /* Invalid designation sequence.  Just ignore it.  */
2593                 break;
2594             }
2595           else if (c == 'N' || c == 'O')
2596             {
2597               /* ESC <Fe> for SS2 or SS3.  */
2598               single_shifting = 1;
2599               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2600               break;
2601             }
2602           else if (c >= '0' && c <= '4')
2603             {
2604               /* ESC <Fp> for start/end composition.  */
2605               found |= CATEGORY_MASK_ISO;
2606               break;
2607             }
2608           else
2609             {
2610               /* Invalid escape sequence.  Just ignore it.  */
2611               break;
2612             }
2613
2614           /* We found a valid designation sequence for CHARSET.  */
2615           rejected |= CATEGORY_MASK_ISO_8BIT;
2616           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2617                               id))
2618             found |= CATEGORY_MASK_ISO_7;
2619           else
2620             rejected |= CATEGORY_MASK_ISO_7;
2621           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2622                               id))
2623             found |= CATEGORY_MASK_ISO_7_TIGHT;
2624           else
2625             rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2626           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2627                               id))
2628             found |= CATEGORY_MASK_ISO_7_ELSE;
2629           else
2630             rejected |= CATEGORY_MASK_ISO_7_ELSE;
2631           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2632                               id))
2633             found |= CATEGORY_MASK_ISO_8_ELSE;
2634           else
2635             rejected |= CATEGORY_MASK_ISO_8_ELSE;
2636           break;
2637
2638         case ISO_CODE_SO:
2639         case ISO_CODE_SI:
2640           /* Locking shift out/in.  */
2641           if (inhibit_iso_escape_detection)
2642             break;
2643           single_shifting = 0;
2644           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2645           found |= CATEGORY_MASK_ISO_ELSE;
2646           break;
2647
2648         case ISO_CODE_CSI:
2649           /* Control sequence introducer.  */
2650           single_shifting = 0;
2651           rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2652           found |= CATEGORY_MASK_ISO_8_ELSE;
2653           goto check_extra_latin;
2654
2655         case ISO_CODE_SS2:
2656         case ISO_CODE_SS3:
2657           /* Single shift.   */
2658           if (inhibit_iso_escape_detection)
2659             break;
2660           single_shifting = 0;
2661           rejected |= CATEGORY_MASK_ISO_7BIT;
2662           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2663               & CODING_ISO_FLAG_SINGLE_SHIFT)
2664             found |= CATEGORY_MASK_ISO_8_1, single_shifting = 1;
2665           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2666               & CODING_ISO_FLAG_SINGLE_SHIFT)
2667             found |= CATEGORY_MASK_ISO_8_2, single_shifting = 1;
2668           if (single_shifting)
2669             break;
2670           goto check_extra_latin;
2671
2672         default:
2673           if (c < 0)
2674             continue;
2675           if (c < 0x80)
2676             {
2677               single_shifting = 0;
2678               break;
2679             }
2680           if (c >= 0xA0)
2681             {
2682               rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2683               found |= CATEGORY_MASK_ISO_8_1;
2684               /* Check the length of succeeding codes of the range
2685                  0xA0..0FF.  If the byte length is even, we include
2686                  CATEGORY_MASK_ISO_8_2 in `found'.  We can check this
2687                  only when we are not single shifting.  */
2688               if (! single_shifting
2689                   && ! (rejected & CATEGORY_MASK_ISO_8_2))
2690                 {
2691                   int i = 1;
2692                   while (src < src_end)
2693                     {
2694                       ONE_MORE_BYTE (c);
2695                       if (c < 0xA0)
2696                         break;
2697                       i++;
2698                     }
2699
2700                   if (i & 1 && src < src_end)
2701                     rejected |= CATEGORY_MASK_ISO_8_2;
2702                   else
2703                     found |= CATEGORY_MASK_ISO_8_2;
2704                 }
2705               break;
2706             }
2707         check_extra_latin:
2708           single_shifting = 0;
2709           if (! VECTORP (Vlatin_extra_code_table)
2710               || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2711             {
2712               rejected = CATEGORY_MASK_ISO;
2713               break;
2714             }
2715           if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2716               & CODING_ISO_FLAG_LATIN_EXTRA)
2717             found |= CATEGORY_MASK_ISO_8_1;
2718           else
2719             rejected |= CATEGORY_MASK_ISO_8_1;
2720           rejected |= CATEGORY_MASK_ISO_8_2;
2721         }
2722     }
2723   detect_info->rejected |= CATEGORY_MASK_ISO;
2724   return 0;
2725
2726  no_more_source:
2727   detect_info->rejected |= rejected;
2728   detect_info->found |= (found & ~rejected);
2729   return 1;
2730 }
2731
2732
2733 /* Set designation state into CODING.  Set CHARS_96 to -1 if the
2734    escape sequence should be kept.  */
     /* In detail: the charset is looked up with ISO_CHARSET_TABLE (DIM,
        CHARS_96, FINAL).  If FINAL is outside '0'..127, the lookup
        yields a negative id, or the charset is not SAFE_CHARSET_P for
        CODING, the designation of REG is set to -2, CHARS_96 is set to
        -1 and nothing else is done.  Otherwise the charset id is
        recorded as the designation of REG, after replacing
        charset_jisx0201_roman by charset_ascii when
        CODING_ISO_FLAG_USE_ROMAN is set and charset_jisx0208_1978 by
        charset_jisx0208 when CODING_ISO_FLAG_USE_OLDJIS is set.
        CHARS_96 must be a modifiable variable, and `coding' is taken
        from the enclosing scope.  */
2735 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2736   do {                                                                  \
2737     int id, prev;                                                       \
2738                                                                         \
2739     if (final < '0' || final >= 128                                     \
2740         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2741         || !SAFE_CHARSET_P (coding, id))                                \
2742       {                                                                 \
2743         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2744         chars_96 = -1;                                                  \
2745         break;                                                          \
2746       }                                                                 \
2747     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2748     if (id == charset_jisx0201_roman)                                   \
2749       {                                                                 \
2750         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2751           id = charset_ascii;                                           \
2752       }                                                                 \
2753     else if (id == charset_jisx0208_1978)                               \
2754       {                                                                 \
2755         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2756           id = charset_jisx0208;                                        \
2757       }                                                                 \
2758     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2759     /* If there was an invalid designation to REG previously, and this  \
2760        designation is ASCII to REG, we should keep this designation     \
2761        sequence.  */                                                    \
2762     if (prev == -2 && id == charset_ascii)                              \
2763       chars_96 = -1;                                                    \
2764   } while (0)
2765
2766
/* If a composition is being decoded (composition_state is not
   COMPOSING_NO), give it up: reset the state to COMPOSING_NO and
   write the collected entries of `components' to CHARBUF as ordinary
   characters, advancing CHAR_OFFSET.  For COMPOSITION_RELATIVE and
   COMPOSITION_WITH_ALTCHARS every entry is written; for the other
   methods only the entries at even indices are (presumably the odd
   ones hold composition rules).  Jump to `no_more_source' if CHARBUF
   has no room for COMPONENT_IDX more elements.  All variables used
   are those of the enclosing function.

   NOTE(review): in the even-index case the loop writes
   (component_idx + 1) / 2 elements while CHAR_OFFSET is advanced by
   component_idx / 2 + 1; the two differ when COMPONENT_IDX is even
   (including 0) -- confirm that this is intended.  */
2767 #define MAYBE_FINISH_COMPOSITION()                              \
2768   do {                                                          \
2769     int i;                                                      \
2770     if (composition_state == COMPOSING_NO)                      \
2771       break;                                                    \
2772     /* It is assured that we have enough room for producing     \
2773        characters stored in the table `components'.  */         \
2774     if (charbuf + component_idx > charbuf_end)                  \
2775       goto no_more_source;                                      \
2776     composition_state = COMPOSING_NO;                           \
2777     if (method == COMPOSITION_RELATIVE                          \
2778         || method == COMPOSITION_WITH_ALTCHARS)                 \
2779       {                                                         \
2780         for (i = 0; i < component_idx; i++)                     \
2781           *charbuf++ = components[i];                           \
2782         char_offset += component_idx;                           \
2783       }                                                         \
2784     else                                                        \
2785       {                                                         \
2786         for (i = 0; i < component_idx; i += 2)                  \
2787           *charbuf++ = components[i];                           \
2788         char_offset += (component_idx / 2) + 1;                 \
2789       }                                                         \
2790   } while (0)
2791
2792
/* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
   ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
   ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
   ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
   ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1

   C1 is the byte that followed ESC.  An ESC 0 met while the state is
   COMPOSING_COMPONENT_RULE only records how many components have been
   collected (COMPONENT_LEN) and switches to COMPOSING_CHAR.  In every
   other case an unfinished composition is flushed with
   MAYBE_FINISH_COMPOSITION, and the rest of the source is searched
   for the terminating ESC 1; the composition is started only when the
   terminator is present.  When it is absent, control goes to
   `invalid_code' if CODING_MODE_LAST_BLOCK is set in CODING->mode and
   to `no_more_source' otherwise.  Control also goes to
   `no_more_source' when CHARBUF has no room for
   MAX_COMPOSITION_COMPONENTS more elements.

   The "terminator absent" test is `p >= src_end - 1', not `=='.  When
   the start sequence is the very end of the source (SRC == SRC_END)
   the search loop does not run at all and P stays at SRC_END, which
   must not be mistaken for a match.  */

#define DECODE_COMPOSITION_START(c1)                                    \
  do {                                                                  \
    if (c1 == '0'                                                       \
        && composition_state == COMPOSING_COMPONENT_RULE)               \
      {                                                                 \
        component_len = component_idx;                                  \
        composition_state = COMPOSING_CHAR;                             \
      }                                                                 \
    else                                                                \
      {                                                                 \
        const unsigned char *p;                                         \
                                                                        \
        MAYBE_FINISH_COMPOSITION ();                                    \
        if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
          goto no_more_source;                                          \
        for (p = src; p < src_end - 1; p++)                             \
          if (*p == ISO_CODE_ESC && p[1] == '1')                        \
            break;                                                      \
        if (p >= src_end - 1)                                           \
          {                                                             \
            if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
              goto invalid_code;                                        \
            goto no_more_source;                                        \
          }                                                             \
                                                                        \
        /* This is surely the start of a composition.  */               \
        method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
                  : c1 == '2' ? COMPOSITION_WITH_RULE                   \
                  : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
                  : COMPOSITION_WITH_RULE_ALTCHARS);                    \
        composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
                             : COMPOSING_COMPONENT_CHAR);               \
        component_idx = component_len = 0;                              \
      }                                                                 \
  } while (0)
2835
2836
2837 /* Handle composition end sequence ESC 1.  */
     /* NCHARS, the number of composed characters, is COMPONENT_IDX -
        COMPONENT_LEN when COMPONENT_LEN is positive, COMPONENT_IDX for
        COMPOSITION_RELATIVE, and (COMPONENT_IDX + 1) / 2 otherwise.
        ADD_COMPOSITION_DATA is given CHARBUF, NCHARS and METHOD
        (presumably it writes the annotation header and advances
        CHARBUF -- confirm against its definition).  For every method
        but COMPOSITION_RELATIVE the component table follows (all of
        it when COMPONENT_LEN is 0, else its first COMPONENT_LEN
        entries), and the element SAVED_CHARBUF points at is then
        overwritten with SAVED_CHARBUF - CHARBUF, i.e. minus the
        number of elements written since that position.  Finally the
        characters themselves are written -- the even-index entries
        for COMPOSITION_WITH_RULE, otherwise the entries from
        COMPONENT_LEN on -- with CHAR_OFFSET advanced once per
        character; CODING->annotated is set and the state returns to
        COMPOSING_NO.  All variables are those of the enclosing
        function.  */
2838
2839 #define DECODE_COMPOSITION_END()                                        \
2840   do {                                                                  \
2841     int nchars = (component_len > 0 ? component_idx - component_len     \
2842                   : method == COMPOSITION_RELATIVE ? component_idx      \
2843                   : (component_idx + 1) / 2);                           \
2844     int i;                                                              \
2845     int *saved_charbuf = charbuf;                                       \
2846                                                                         \
2847     ADD_COMPOSITION_DATA (charbuf, nchars, method);                     \
2848     if (method != COMPOSITION_RELATIVE)                                 \
2849       {                                                                 \
2850         if (component_len == 0)                                         \
2851           for (i = 0; i < component_idx; i++)                           \
2852             *charbuf++ = components[i];                                 \
2853         else                                                            \
2854           for (i = 0; i < component_len; i++)                           \
2855             *charbuf++ = components[i];                                 \
2856         *saved_charbuf = saved_charbuf - charbuf;                       \
2857       }                                                                 \
2858     if (method == COMPOSITION_WITH_RULE)                                \
2859       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2860         *charbuf++ = components[i];                                     \
2861     else                                                                \
2862       for (i = component_len; i < component_idx; i++, char_offset++)    \
2863         *charbuf++ = components[i];                                     \
2864     coding->annotated = 1;                                              \
2865     composition_state = COMPOSING_NO;                                   \
2866   } while (0)
2867
2868
2869 /* Decode a composition rule from the byte C1 (and maybe one more byte
2870    from SRC) and replace C1 with the encoded composition rule
2871    (C1 becomes 0 when it matches neither rule format).
        C1 must be a modifiable int.  After 32 is subtracted from it, a
        value below 81 is the one-byte old format: it is split into
        C1 / 9 and C1 % 9, each 4 is mapped to 10, and the pair is
        encoded with COMPOSITION_ENCODE_RULE.  A value from 81 to 92 is
        the two-byte new format: its second byte is read into C2 of
        the enclosing scope with ONE_MORE_BYTE, and
        (C1 - 81, C2 - 32) is encoded.  */
2872
2873 #define DECODE_COMPOSITION_RULE(c1)                                     \
2874   do {                                                                  \
2875     (c1) -= 32;                                                         \
2876     if (c1 < 81)                /* old format (before ver.21) */        \
2877       {                                                                 \
2878         int gref = (c1) / 9;                                            \
2879         int nref = (c1) % 9;                                            \
2880         if (gref == 4) gref = 10;                                       \
2881         if (nref == 4) nref = 10;                                       \
2882         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2883       }                                                                 \
2884     else if (c1 < 93)           /* new format (after ver.21) */         \
2885       {                                                                 \
2886         ONE_MORE_BYTE (c2);                                             \
2887         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2888       }                                                                 \
2889     else                                                                \
2890       c1 = 0;                                                           \
2891   } while (0)
2892
2893
2894 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2895
2896 static void
2897 decode_coding_iso_2022 (coding)
2898      struct coding_system *coding;
2899 {
2900   const unsigned char *src = coding->source + coding->consumed;
2901   const unsigned char *src_end = coding->source + coding->src_bytes;
2902   const unsigned char *src_base;
2903   int *charbuf = coding->charbuf + coding->charbuf_used;
2904   int *charbuf_end
2905     = coding->charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2906   int consumed_chars = 0, consumed_chars_base;
2907   int multibytep = coding->src_multibyte;
2908   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2909   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2910   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2911   int charset_id_2, charset_id_3;
2912   struct charset *charset;
2913   int c;
2914   /* For handling composition sequence.  */
2915 #define COMPOSING_NO                    0
2916 #define COMPOSING_CHAR                  1
2917 #define COMPOSING_RULE                  2
2918 #define COMPOSING_COMPONENT_CHAR        3
2919 #define COMPOSING_COMPONENT_RULE        4
2920
2921   int composition_state = COMPOSING_NO;
2922   enum composition_method method;
2923   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2924   int component_idx;
2925   int component_len;
2926   Lisp_Object attrs, charset_list;
2927   int char_offset = coding->produced_char;
2928   int last_offset = char_offset;
2929   int last_id = charset_ascii;
2930
2931   CODING_GET_INFO (coding, attrs, charset_list);
2932   setup_iso_safe_charsets (attrs);
2933
2934   while (1)
2935     {
2936       int c1, c2;
2937
2938       src_base = src;
2939       consumed_chars_base = consumed_chars;
2940
2941       if (charbuf >= charbuf_end)
2942         break;
2943
2944       ONE_MORE_BYTE (c1);
2945       if (c1 < 0)
2946         goto invalid_code;
2947
2948       /* We produce at most one character.  */
2949       switch (iso_code_class [c1])
2950         {
2951         case ISO_0x20_or_0x7F:
2952           if (composition_state != COMPOSING_NO)
2953             {
2954               if (composition_state == COMPOSING_RULE
2955                   || composition_state == COMPOSING_COMPONENT_RULE)
2956                 {
2957                   DECODE_COMPOSITION_RULE (c1);
2958                   components[component_idx++] = c1;
2959                   composition_state--;
2960                   continue;
2961                 }
2962             }
2963           if (charset_id_0 < 0
2964               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2965             /* This is SPACE or DEL.  */
2966             charset = CHARSET_FROM_ID (charset_ascii);
2967           else
2968             charset = CHARSET_FROM_ID (charset_id_0);
2969           break;
2970
2971         case ISO_graphic_plane_0:
2972           if (composition_state != COMPOSING_NO)
2973             {
2974               if (composition_state == COMPOSING_RULE
2975                   || composition_state == COMPOSING_COMPONENT_RULE)
2976                 {
2977                   DECODE_COMPOSITION_RULE (c1);
2978                   components[component_idx++] = c1;
2979                   composition_state--;
2980                   continue;
2981                 }
2982             }
2983           if (charset_id_0 < 0)
2984             charset = CHARSET_FROM_ID (charset_ascii);
2985           else
2986             charset = CHARSET_FROM_ID (charset_id_0);
2987           break;
2988
2989         case ISO_0xA0_or_0xFF:
2990           if (charset_id_1 < 0
2991               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2992               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2993             goto invalid_code;
2994           /* This is a graphic character, we fall down ... */
2995
2996         case ISO_graphic_plane_1:
2997           if (charset_id_1 < 0)
2998             goto invalid_code;
2999           charset = CHARSET_FROM_ID (charset_id_1);
3000           break;
3001
3002         case ISO_control_0:
3003           MAYBE_FINISH_COMPOSITION ();
3004           charset = CHARSET_FROM_ID (charset_ascii);
3005           break;
3006
3007         case ISO_control_1:
3008           MAYBE_FINISH_COMPOSITION ();
3009           goto invalid_code;
3010
3011         case ISO_shift_out:
3012           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3013               || CODING_ISO_DESIGNATION (coding, 1) < 0)
3014             goto invalid_code;
3015           CODING_ISO_INVOCATION (coding, 0) = 1;
3016           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3017           continue;
3018
3019         case ISO_shift_in:
3020           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
3021             goto invalid_code;
3022           CODING_ISO_INVOCATION (coding, 0) = 0;
3023           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3024           continue;
3025
3026         case ISO_single_shift_2_7:
3027         case ISO_single_shift_2:
3028           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3029             goto invalid_code;
3030           /* SS2 is handled as an escape sequence of ESC 'N' */
3031           c1 = 'N';
3032           goto label_escape_sequence;
3033
3034         case ISO_single_shift_3:
3035           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
3036             goto invalid_code;
3037           /* SS2 is handled as an escape sequence of ESC 'O' */
3038           c1 = 'O';
3039           goto label_escape_sequence;
3040
3041         case ISO_control_sequence_introducer:
3042           /* CSI is handled as an escape sequence of ESC '[' ...  */
3043           c1 = '[';
3044           goto label_escape_sequence;
3045
3046         case ISO_escape:
3047           ONE_MORE_BYTE (c1);
3048         label_escape_sequence:
3049           /* Escape sequences handled here are invocation,
3050              designation, direction specification, and character
3051              composition specification.  */
3052           switch (c1)
3053             {
3054             case '&':           /* revision of following character set */
3055               ONE_MORE_BYTE (c1);
3056               if (!(c1 >= '@' && c1 <= '~'))
3057                 goto invalid_code;
3058               ONE_MORE_BYTE (c1);
3059               if (c1 != ISO_CODE_ESC)
3060                 goto invalid_code;
3061               ONE_MORE_BYTE (c1);
3062               goto label_escape_sequence;
3063
3064             case '$':           /* designation of 2-byte character set */
3065               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3066                 goto invalid_code;
3067               {
3068                 int reg, chars96;
3069
3070                 ONE_MORE_BYTE (c1);
3071                 if (c1 >= '@' && c1 <= 'B')
3072                   {     /* designation of JISX0208.1978, GB2312.1980,
3073                            or JISX0208.1980 */
3074                     reg = 0, chars96 = 0;
3075                   }
3076                 else if (c1 >= 0x28 && c1 <= 0x2B)
3077                   { /* designation of DIMENSION2_CHARS94 character set */
3078                     reg = c1 - 0x28, chars96 = 0;
3079                     ONE_MORE_BYTE (c1);
3080                   }
3081                 else if (c1 >= 0x2C && c1 <= 0x2F)
3082                   { /* designation of DIMENSION2_CHARS96 character set */
3083                     reg = c1 - 0x2C, chars96 = 1;
3084                     ONE_MORE_BYTE (c1);
3085                   }
3086                 else
3087                   goto invalid_code;
3088                 DECODE_DESIGNATION (reg, 2, chars96, c1);
3089                 /* We must update these variables now.  */
3090                 if (reg == 0)
3091                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3092                 else if (reg == 1)
3093                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3094                 if (chars96 < 0)
3095                   goto invalid_code;
3096               }
3097               continue;
3098
3099             case 'n':           /* invocation of locking-shift-2 */
3100               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3101                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3102                 goto invalid_code;
3103               CODING_ISO_INVOCATION (coding, 0) = 2;
3104               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3105               continue;
3106
3107             case 'o':           /* invocation of locking-shift-3 */
3108               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
3109                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3110                 goto invalid_code;
3111               CODING_ISO_INVOCATION (coding, 0) = 3;
3112               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3113               continue;
3114
3115             case 'N':           /* invocation of single-shift-2 */
3116               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3117                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
3118                 goto invalid_code;
3119               charset_id_2 = CODING_ISO_DESIGNATION (coding, 2);
3120               if (charset_id_2 < 0)
3121                 charset = CHARSET_FROM_ID (charset_ascii);
3122               else
3123                 charset = CHARSET_FROM_ID (charset_id_2);
3124               ONE_MORE_BYTE (c1);
3125               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3126                 goto invalid_code;
3127               break;
3128
3129             case 'O':           /* invocation of single-shift-3 */
3130               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3131                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
3132                 goto invalid_code;
3133               charset_id_3 = CODING_ISO_DESIGNATION (coding, 3);
3134               if (charset_id_3 < 0)
3135                 charset = CHARSET_FROM_ID (charset_ascii);
3136               else
3137                 charset = CHARSET_FROM_ID (charset_id_3);
3138               ONE_MORE_BYTE (c1);
3139               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3140                 goto invalid_code;
3141               break;
3142
3143             case '0': case '2': case '3': case '4': /* start composition */
3144               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3145                 goto invalid_code;
3146               DECODE_COMPOSITION_START (c1);
3147               continue;
3148
3149             case '1':           /* end composition */
3150               if (composition_state == COMPOSING_NO)
3151                 goto invalid_code;
3152               DECODE_COMPOSITION_END ();
3153               continue;
3154
3155             case '[':           /* specification of direction */
3156               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3157                 goto invalid_code;
3158               /* For the moment, nested direction is not supported.
3159                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
3160                  left-to-right, and nozero means right-to-left.  */
3161               ONE_MORE_BYTE (c1);
3162               switch (c1)
3163                 {
3164                 case ']':       /* end of the current direction */
3165                   coding->mode &= ~CODING_MODE_DIRECTION;
3166
3167                 case '0':       /* end of the current direction */
3168                 case '1':       /* start of left-to-right direction */
3169                   ONE_MORE_BYTE (c1);
3170                   if (c1 == ']')
3171                     coding->mode &= ~CODING_MODE_DIRECTION;
3172                   else
3173                     goto invalid_code;
3174                   break;
3175
3176                 case '2':       /* start of right-to-left direction */
3177                   ONE_MORE_BYTE (c1);
3178                   if (c1 == ']')
3179                     coding->mode |= CODING_MODE_DIRECTION;
3180                   else
3181                     goto invalid_code;
3182                   break;
3183
3184                 default:
3185                   goto invalid_code;
3186                 }
3187               continue;
3188
3189             case '%':
3190               ONE_MORE_BYTE (c1);
3191               if (c1 == '/')
3192                 {
3193                   /* CTEXT extended segment:
3194                      ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3195                      We keep these bytes as is for the moment.
3196                      They may be decoded by post-read-conversion.  */
3197                   int dim, M, L;
3198                   int size;
3199
3200                   ONE_MORE_BYTE (dim);
3201                   ONE_MORE_BYTE (M);
3202                   ONE_MORE_BYTE (L);
3203                   size = ((M - 128) * 128) + (L - 128);
3204                   if (charbuf + 8 + size > charbuf_end)
3205                     goto break_loop;
3206                   *charbuf++ = ISO_CODE_ESC;
3207                   *charbuf++ = '%';
3208                   *charbuf++ = '/';
3209                   *charbuf++ = dim;
3210                   *charbuf++ = BYTE8_TO_CHAR (M);
3211                   *charbuf++ = BYTE8_TO_CHAR (L);
3212                   while (size-- > 0)
3213                     {
3214                       ONE_MORE_BYTE (c1);
3215                       *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3216                     }
3217                 }
3218               else if (c1 == 'G')
3219                 {
3220                   /* XFree86 extension for embedding UTF-8 in CTEXT:
3221                      ESC % G --UTF-8-BYTES-- ESC % @
3222                      We keep these bytes as is for the moment.
3223                      They may be decoded by post-read-conversion.  */
3224                   int *p = charbuf;
3225
3226                   if (p + 6 > charbuf_end)
3227                     goto break_loop;
3228                   *p++ = ISO_CODE_ESC;
3229                   *p++ = '%';
3230                   *p++ = 'G';
3231                   while (p < charbuf_end)
3232                     {
3233                       ONE_MORE_BYTE (c1);
3234                       if (c1 == ISO_CODE_ESC
3235                           && src + 1 < src_end
3236                           && src[0] == '%'
3237                           && src[1] == '@')
3238                         {
3239                           src += 2;
3240                           break;
3241                         }
3242                       *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3243                     }
3244                   if (p + 3 > charbuf_end)
3245                     goto break_loop;
3246                   *p++ = ISO_CODE_ESC;
3247                   *p++ = '%';
3248                   *p++ = '@';
3249                   charbuf = p;
3250                 }
3251               else
3252                 goto invalid_code;
3253               continue;
3254               break;
3255
3256             default:
3257               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3258                 goto invalid_code;
3259               {
3260                 int reg, chars96;
3261
3262                 if (c1 >= 0x28 && c1 <= 0x2B)
3263                   { /* designation of DIMENSION1_CHARS94 character set */
3264                     reg = c1 - 0x28, chars96 = 0;
3265                     ONE_MORE_BYTE (c1);
3266                   }
3267                 else if (c1 >= 0x2C && c1 <= 0x2F)
3268                   { /* designation of DIMENSION1_CHARS96 character set */
3269                     reg = c1 - 0x2C, chars96 = 1;
3270                     ONE_MORE_BYTE (c1);
3271                   }
3272                 else
3273                   goto invalid_code;
3274                 DECODE_DESIGNATION (reg, 1, chars96, c1);
3275                 /* We must update these variables now.  */
3276                 if (reg == 0)
3277                   charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3278                 else if (reg == 1)
3279                   charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3280                 if (chars96 < 0)
3281                   goto invalid_code;
3282               }
3283               continue;
3284             }
3285         }
3286
3287       if (charset->id != charset_ascii
3288           && last_id != charset->id)
3289         {
3290           if (last_id != charset_ascii)
3291             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3292           last_id = charset->id;
3293           last_offset = char_offset;
3294         }
3295
3296       /* Now we know CHARSET and 1st position code C1 of a character.
3297          Produce a decoded character while getting 2nd position code
3298          C2 if necessary.  */
3299       c1 &= 0x7F;
3300       if (CHARSET_DIMENSION (charset) > 1)
3301         {
3302           ONE_MORE_BYTE (c2);
3303           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3304             /* C2 is not in a valid range.  */
3305             goto invalid_code;
3306           c1 = (c1 << 8) | (c2 & 0x7F);
3307           if (CHARSET_DIMENSION (charset) > 2)
3308             {
3309               ONE_MORE_BYTE (c2);
3310               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3311                 /* C2 is not in a valid range.  */
3312                 goto invalid_code;
3313               c1 = (c1 << 8) | (c2 & 0x7F);
3314             }
3315         }
3316
3317       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3318       if (c < 0)
3319         {
3320           MAYBE_FINISH_COMPOSITION ();
3321           for (; src_base < src; src_base++, char_offset++)
3322             {
3323               if (ASCII_BYTE_P (*src_base))
3324                 *charbuf++ = *src_base;
3325               else
3326                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3327             }
3328         }
3329       else if (composition_state == COMPOSING_NO)
3330         {
3331           *charbuf++ = c;
3332           char_offset++;
3333         }
3334       else
3335         {
3336           components[component_idx++] = c;
3337           if (method == COMPOSITION_WITH_RULE
3338               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3339                   && composition_state == COMPOSING_COMPONENT_CHAR))
3340             composition_state++;
3341         }
3342       continue;
3343
3344     invalid_code:
3345       MAYBE_FINISH_COMPOSITION ();
3346       src = src_base;
3347       consumed_chars = consumed_chars_base;
3348       ONE_MORE_BYTE (c);
3349       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3350       char_offset++;
3351       coding->errors++;
3352       continue;
3353
3354     break_loop:
3355       break;
3356     }
3357
3358  no_more_source:
3359   if (last_id != charset_ascii)
3360     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
3361   coding->consumed_char += consumed_chars_base;
3362   coding->consumed = src_base - coding->source;
3363   coding->charbuf_used = charbuf - coding->charbuf;
3364 }
3365
3366
3367 /* ISO2022 encoding stuff.  */
3368
3369 /*
3370    It is not enough to say just "ISO2022" on encoding, we have to
3371    specify more details.  In Emacs, each coding system of ISO2022
3372    variant has the following specifications:
3373         1. Initial designation to G0 thru G3.
3374         2. Allows short-form designation?
3375         3. ASCII should be designated to G0 before control characters?
3376         4. ASCII should be designated to G0 at end of line?
3377         5. 7-bit environment or 8-bit environment?
3378         6. Use locking-shift?
3379         7. Use Single-shift?
3380    And the following two are only for Japanese:
3381         8. Use ASCII in place of JIS0201-1976-Roman?
3382         9. Use JISX0208-1983 in place of JISX0208-1978?
3383    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3384    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3385    details.
3386 */
3387
/* Emit, at DST, the escape sequence that designates CHARSET to
   graphic register REG, advance DST, and record the new designation
   in CODING.  The sequence produced is:

     [ESC '&' <revision>] ESC ['$'] [<intermediate>] <final-char>

   The revision prefix is produced only when CODING has
   CODING_ISO_FLAG_REVISION set and the revision of CHARSET is not
   negative.  '$' is produced when the dimension of CHARSET is not 1.
   <intermediate> is taken from "()*+" (94-character sets) or ",-./"
   (96-character sets), indexed by REG.  It is omitted only in the
   short form: a multi-dimensional 94-character set designated to
   register 0 whose <final-char> is '@', 'A', or 'B', and only if
   CODING does not have CODING_ISO_FLAG_LONG_FORM set.  */

#define ENCODE_DESIGNATION(charset, reg, coding)                        \
  do {                                                                  \
    const char *inter_94 = "()*+";                                      \
    const char *inter_96 = ",-./";                                      \
    unsigned char final_byte = CHARSET_ISO_FINAL (charset);             \
    int is_96 = CHARSET_ISO_CHARS_96 (charset) ? 1 : 0;                 \
    int multi_dim = CHARSET_DIMENSION (charset) != 1;                   \
    int rev = ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)   \
               ? CHARSET_ISO_REVISION (charset) : -1);                  \
    /* Non-zero if the intermediate byte may be dropped.  */            \
    int short_form = (multi_dim && ! is_96                              \
                      && ! (CODING_ISO_FLAGS (coding)                   \
                            & CODING_ISO_FLAG_LONG_FORM)                \
                      && (reg) == 0                                     \
                      && final_byte >= '@' && final_byte <= 'B');       \
                                                                        \
    if (rev >= 0)                                                       \
      {                                                                 \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
        EMIT_ONE_BYTE ('@' + rev);                                      \
      }                                                                 \
    EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
    if (multi_dim)                                                      \
      EMIT_ONE_ASCII_BYTE ('$');                                        \
    if (! short_form)                                                   \
      EMIT_ONE_ASCII_BYTE ((is_96 ? inter_96 : inter_94)[reg]);         \
    EMIT_ONE_ASCII_BYTE (final_byte);                                   \
                                                                        \
    CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
  } while (0)
3435
3436
/* The following two macros produce the code (a single control byte,
   or an escape sequence when CODING has CODING_ISO_FLAG_SEVEN_BITS
   set) for the ISO2022 single-shift functions single-shift-2 and
   single-shift-3, and then mark CODING as single-shifting.  */

#define ENCODE_SINGLE_SHIFT_2                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        /* Not restricted to 7 bits: the SS2 byte itself.  */           \
        EMIT_ONE_BYTE (ISO_CODE_SS2);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* 7-bit only: the two-byte form ESC 'N'.  */                   \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3449
3450
/* Produce the code for single-shift-3: the SS3 byte, or ESC 'O' when
   CODING has CODING_ISO_FLAG_SEVEN_BITS set.  Then mark CODING as
   single-shifting.  */
#define ENCODE_SINGLE_SHIFT_3                                           \
  do {                                                                  \
    if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS))     \
      {                                                                 \
        /* Not restricted to 7 bits: the SS3 byte itself.  */           \
        EMIT_ONE_BYTE (ISO_CODE_SS3);                                   \
      }                                                                 \
    else                                                                \
      {                                                                 \
        /* 7-bit only: the two-byte form ESC 'O'.  */                   \
        EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                       \
      }                                                                 \
    CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
  } while (0)
3459
3460
/* The following four macros produce the code (a control character or
   an escape sequence) for the ISO2022 locking-shift functions
   shift-in, shift-out, locking-shift-2, and locking-shift-3.  Each
   one also records in CODING which graphic register is now invoked
   to graphic plane 0.  */

/* Shift-in: emit SI; graphic register 0 is invoked to plane 0.  */
#define ENCODE_SHIFT_IN                                                 \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                                \
      CODING_ISO_INVOCATION (coding, 0) = 0;                            \
    }                                                                   \
  while (0)
3470
3471
/* Shift-out: emit SO; graphic register 1 is invoked to plane 0.  */
#define ENCODE_SHIFT_OUT                                                \
  do                                                                    \
    {                                                                   \
      EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                                \
      CODING_ISO_INVOCATION (coding, 0) = 1;                            \
    }                                                                   \
  while (0)
3477
3478
/* Locking-shift-2: emit ESC 'n'; graphic register 2 is invoked to
   plane 0.  */
#define ENCODE_LOCKING_SHIFT_2                                          \
  do                                                                    \
    {                                                                   \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');                         \
      CODING_ISO_INVOCATION (coding, 0) = 2;                            \
    }                                                                   \
  while (0)
3484
3485
/* Locking-shift-3: emit ESC 'o'; graphic register 3 is invoked to
   plane 0.  The final byte must be 'o': ESC 'n' is locking-shift-2,
   and the decoder in this file accepts ESC 'o' as the invocation of
   locking-shift-3.  */
#define ENCODE_LOCKING_SHIFT_3                          \
  do {                                                  \
    EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'o');           \
    CODING_ISO_INVOCATION (coding, 0) = 3;              \
  } while (0)
3491
3492
/* Produce codes for a DIMENSION1 character whose character set is
   CHARSET and whose position-code is C1.  Designation and invocation
   sequences are also produced in advance if necessary.

   CHARSET must be an lvalue: when CODING has
   CODING_ISO_FLAG_USE_ROMAN set and CHARSET is ASCII, it is replaced
   by the charset for charset_jisx0201_roman.  The expansion refers to
   the variables `coding', `dst' and `produced_chars' of the place
   where the macro is used.

   The body is a loop: while CHARSET is invoked to neither graphic
   plane (and no single-shift is pending),
   encode_invocation_designation is called and the tests are repeated.
   C1 is emitted with the 8th bit cleared for graphic plane 0 and with
   the 8th bit set for graphic plane 1.  A pending single-shift is
   consumed by this character; in that case the 8th bit is cleared
   only if CODING has CODING_ISO_FLAG_SEVEN_BITS set.

   C1 is parenthesized in the expansion so that an expression argument
   is masked as a whole.  */

#define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
  do {                                                                  \
    int id = CHARSET_ID (charset);                                      \
                                                                        \
    if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
        && id == charset_ascii)                                         \
      {                                                                 \
        id = charset_jisx0201_roman;                                    \
        charset = CHARSET_FROM_ID (id);                                 \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                            \
        else                                                            \
          EMIT_ONE_BYTE ((c1) | 0x80);                                  \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
      {                                                                 \
        EMIT_ONE_ASCII_BYTE ((c1) & 0x7F);                              \
        break;                                                          \
      }                                                                 \
    else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
      {                                                                 \
        EMIT_ONE_BYTE ((c1) | 0x80);                                    \
        break;                                                          \
      }                                                                 \
    else                                                                \
      /* Since CHARSET is not yet invoked to any graphic planes, we     \
         must invoke it, or, at first, designate it to some graphic     \
         register.  Then repeat the loop to actually produce the        \
         character.  */                                                 \
      dst = encode_invocation_designation (charset, coding, dst,        \
                                           &produced_chars);            \
  } while (1)
3535
3536
/* Produce codes for a DIMENSION2 character of CHARSET whose
   position-codes are C1 and C2, producing designation and invocation
   codes beforehand when they are needed.

   CHARSET must be an lvalue: when CODING has
   CODING_ISO_FLAG_USE_OLDJIS set and CHARSET is charset_jisx0208, it
   is replaced by the charset for charset_jisx0208_1978.  The
   expansion refers to the variables `coding', `dst' and
   `produced_chars' of the place where the macro is used.  */

#define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
  do {                                                                  \
    int cs_id = CHARSET_ID (charset);                                   \
                                                                        \
    if (cs_id == charset_jisx0208                                       \
        && (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS))    \
      {                                                                 \
        cs_id = charset_jisx0208_1978;                                  \
        charset = CHARSET_FROM_ID (cs_id);                              \
      }                                                                 \
                                                                        \
    if (! CODING_ISO_SINGLE_SHIFTING (coding)                           \
        && cs_id != CODING_ISO_INVOKED_CHARSET (coding, 0)              \
        && cs_id != CODING_ISO_INVOKED_CHARSET (coding, 1))             \
      {                                                                 \
        /* CHARSET is on neither graphic plane yet.  Invoke it          \
           (designating it to a graphic register first if it is not     \
           designated), then run the tests again.  */                   \
        dst = encode_invocation_designation (charset, coding, dst,      \
                                             &produced_chars);          \
        continue;                                                       \
      }                                                                 \
                                                                        \
    if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
      {                                                                 \
        /* This character consumes the pending single-shift.  */        \
        if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
          EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
        else                                                            \
          EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
        CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
      }                                                                 \
    else if (cs_id == CODING_ISO_INVOKED_CHARSET (coding, 0))           \
      EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                  \
    else                                                                \
      EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                        \
    break;                                                              \
  } while (1)
3579
3580
/* Produce codes for character C of CHARSET.  C is converted to a
   code by ENCODE_CHAR.  If the dimension of CHARSET is 1, the code
   is passed on as the single position-code; otherwise the code
   shifted right by 8 bits and its low 8 bits are passed on as the
   two position-codes.  */
#define ENCODE_ISO_CHARACTER(charset, c)                                \
  do {                                                                  \
    int iso_code = ENCODE_CHAR ((charset),(c));                         \
                                                                        \
    if (CHARSET_DIMENSION (charset) != 1)                               \
      ENCODE_ISO_CHARACTER_DIMENSION2 ((charset),                       \
                                       iso_code >> 8, iso_code & 0xFF); \
    else                                                                \
      ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), iso_code);            \
  } while (0)
3590
3591
3592 /* Produce designation and invocation codes at a place pointed by DST
3593    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3594    Return new DST.  */
3595
3596 unsigned char *
3597 encode_invocation_designation (charset, coding, dst, p_nchars)
3598      struct charset *charset;
3599      struct coding_system *coding;
3600      unsigned char *dst;
3601      int *p_nchars;
3602 {
3603   int multibytep = coding->dst_multibyte;
3604   int produced_chars = *p_nchars;
3605   int reg;                      /* graphic register number */
3606   int id = CHARSET_ID (charset);
3607
3608   /* At first, check designations.  */
3609   for (reg = 0; reg < 4; reg++)
3610     if (id == CODING_ISO_DESIGNATION (coding, reg))
3611       break;
3612
3613   if (reg >= 4)
3614     {
3615       /* CHARSET is not yet designated to any graphic registers.  */
3616       /* At first check the requested designation.  */
3617       reg = CODING_ISO_REQUEST (coding, id);
3618       if (reg < 0)
3619         /* Since CHARSET requests no special designation, designate it
3620            to graphic register 0.  */
3621         reg = 0;
3622
3623       ENCODE_DESIGNATION (charset, reg, coding);
3624     }
3625
3626   if (CODING_ISO_INVOCATION (coding, 0) != reg
3627       && CODING_ISO_INVOCATION (coding, 1) != reg)
3628     {
3629       /* Since the graphic register REG is not invoked to any graphic
3630          planes, invoke it to graphic plane 0.  */
3631       switch (reg)
3632         {
3633         case 0:                 /* graphic register 0 */
3634           ENCODE_SHIFT_IN;
3635           break;
3636
3637         case 1:                 /* graphic register 1 */
3638           ENCODE_SHIFT_OUT;
3639           break;
3640
3641         case 2:                 /* graphic register 2 */
3642           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3643             ENCODE_SINGLE_SHIFT_2;
3644           else
3645             ENCODE_LOCKING_SHIFT_2;
3646           break;
3647
3648         case 3:                 /* graphic register 3 */
3649           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3650             ENCODE_SINGLE_SHIFT_3;
3651           else
3652             ENCODE_LOCKING_SHIFT_3;
3653           break;
3654         }
3655     }
3656
3657   *p_nchars = produced_chars;
3658   return dst;
3659 }
3660
/* The following three macros produce codes for indicating direction
   of text.  */

/* Produce a control sequence introducer: ESC '[' when CODING has
   CODING_ISO_FLAG_SEVEN_BITS set, the single byte ISO_CODE_CSI
   otherwise.  CODING_ISO_FLAGS is a set of flag bits, so the flag
   must be tested with `&'; comparing the whole value with `==' would
   select the 7-bit form only when no other flag is set.  */
#define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
  do {                                                                  \
    if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
      EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
    else                                                                \
      EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
  } while (0)
3670
3671
/* Produce the code that starts right-to-left text: a control
   sequence introducer followed by '2' ']'.
   ENCODE_CONTROL_SEQUENCE_INTRODUCER is an object-like macro, so it
   must be used without an argument list.  */
#define ENCODE_DIRECTION_R2L()                  \
  do {                                          \
    ENCODE_CONTROL_SEQUENCE_INTRODUCER;         \
    EMIT_TWO_ASCII_BYTES ('2', ']');            \
  } while (0)
3677
3678
3679 #define ENCODE_DIRECTION_L2R()                  \
3680   do {                                          \
3681     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3682     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3683   } while (0)
3684
3685
3686 /* Produce codes to reset the graphic planes and registers to initial
3687    state: shift in, then re-designate each changed initial charset.  */
3688 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3689   do {                                                                  \
3690     int reg;                                                            \
3691     struct charset *charset;                                            \
3692     /* Shift in when plane 0 does not hold graphic register 0.  */      \
3693     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3694       ENCODE_SHIFT_IN;                                                  \
3695     for (reg = 0; reg < 4; reg++)                                       \
3696       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3697           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3698               != CODING_ISO_INITIAL (coding, reg)))                     \
3699         {                                                               \
3700           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3701           ENCODE_DESIGNATION (charset, reg, coding);                    \
3702         }                                                               \
3703   } while (0)
3704
3705
3706 /* Produce designation sequences of charsets in the line started from
3707    CHARBUF to a place pointed by DST, and return updated DST.  The scan
3708    stops at a newline, at CHARBUF_END, or once all four registers are
3709    claimed.  If the current block ends before any end-of-line, we may
3710    fail to find all the necessary designations.  */
3711
3712 static unsigned char *
3713 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3714      struct coding_system *coding;
3715      int *charbuf, *charbuf_end;
3716      unsigned char *dst;
3717 {
3718   struct charset *charset;
3719   /* Table of charsets to be designated to each graphic register.  */
3720   int r[4];
3721   int c, found = 0, reg;
3722   int produced_chars = 0;  /* NOTE(review): presumably used by ENCODE_DESIGNATION.  */
3723   int multibytep = coding->dst_multibyte;
3724   Lisp_Object attrs;
3725   Lisp_Object charset_list;
3726
3727   attrs = CODING_ID_ATTRS (coding->id);
3728   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3729   if (EQ (charset_list, Qiso_2022))
3730     charset_list = Viso_2022_charset_list;
3731
3732   for (reg = 0; reg < 4; reg++)
3733     r[reg] = -1;
3734   /* Never read at or past CHARBUF_END; the block may hold no newline.  */
3735   while (found < 4 && charbuf < charbuf_end)
3736     {
3737       int id;
3738
3739       c = *charbuf++;
3740       if (c == '\n')
3741         break;
3742       charset = char_charset (c, charset_list, NULL);
3743       id = charset ? CHARSET_ID (charset) : -1;  /* No charset: skip C.  */
3744       reg = id >= 0 ? CODING_ISO_REQUEST (coding, id) : -1;
3745       if (reg >= 0 && r[reg] < 0)
3746         {
3747           found++;
3748           r[reg] = id;
3749         }
3750     }
3751
3752   if (found)
3753     {
3754       for (reg = 0; reg < 4; reg++)
3755         if (r[reg] >= 0
3756             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3757           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3758     }
3759
3760   return dst;
3761 }
3762
3763 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3764    Encode the characters in CODING->charbuf as ISO 2022; return 0.  */
3765 static int
3766 encode_coding_iso_2022 (coding)
3767      struct coding_system *coding;
3768 {
3769   int multibytep = coding->dst_multibyte;
3770   int *charbuf = coding->charbuf;
3771   int *charbuf_end = charbuf + coding->charbuf_used;
3772   unsigned char *dst = coding->destination + coding->produced;
3773   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3774   int safe_room = 16;
3775   int bol_designation
3776     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3777        && CODING_ISO_BOL (coding));
3778   int produced_chars = 0;
3779   Lisp_Object attrs, eol_type, charset_list;
3780   int ascii_compatible;
3781   int c;
3782   int preferred_charset_id = -1;
3783
3784   CODING_GET_INFO (coding, attrs, charset_list);
3785   eol_type = CODING_ID_EOL_TYPE (coding->id);
3786   if (VECTORP (eol_type))
3787     eol_type = Qunix;  /* A vector eol type is handled as unix.  */
3788
3789   setup_iso_safe_charsets (attrs);
3790   /* Charset list may have been changed.  */
3791   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3792   coding->safe_charsets = (char *) SDATA (CODING_ATTR_SAFE_CHARSETS(attrs));
3793
3794   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3795
3796   while (charbuf < charbuf_end)
3797     {
3798       ASSURE_DESTINATION (safe_room);
3799
3800       if (bol_designation)
3801         {
3802           unsigned char *dst_prev = dst;
3803
3804           /* We have to produce designation sequences if any now.  */
3805           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3806           bol_designation = 0;
3807           /* We are sure that designation sequences are all ASCII bytes.  */
3808           produced_chars += dst - dst_prev;
3809         }
3810
3811       c = *charbuf++;
3812
3813       if (c < 0)
3814         {
3815           /* Handle an annotation.  */
3816           switch (*charbuf)
3817             {
3818             case CODING_ANNOTATE_COMPOSITION_MASK:
3819               /* Not yet implemented.  */
3820               break;
3821             case CODING_ANNOTATE_CHARSET_MASK:
3822               preferred_charset_id = charbuf[3];
3823               if (preferred_charset_id >= 0
3824                   && NILP (Fmemq (make_number (preferred_charset_id),
3825                                   charset_list)))
3826                 preferred_charset_id = -1;
3827               break;
3828             default:
3829               abort ();
3830             }
3831           charbuf += -c - 1;  /* Skip the remaining -C - 1 elements.  */
3832           continue;
3833         }
3834
3835       /* Now encode the character C.  */
3836       if (c < 0x20 || c == 0x7F)
3837         {
3838           if (c == '\n'
3839               || (c == '\r' && EQ (eol_type, Qmac)))
3840             {
3841               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3842                 ENCODE_RESET_PLANE_AND_REGISTER ();
3843               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3844                 {
3845                   int i;
3846
3847                   for (i = 0; i < 4; i++)
3848                     CODING_ISO_DESIGNATION (coding, i)
3849                       = CODING_ISO_INITIAL (coding, i);
3850                 }
3851               bol_designation
3852                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3853             }
3854           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3855             ENCODE_RESET_PLANE_AND_REGISTER ();
3856           EMIT_ONE_ASCII_BYTE (c);
3857         }
3858       else if (ASCII_CHAR_P (c))
3859         {
3860           if (ascii_compatible)
3861             EMIT_ONE_ASCII_BYTE (c);
3862           else
3863             {
3864               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3865               ENCODE_ISO_CHARACTER (charset, c);
3866             }
3867         }
3868       else if (CHAR_BYTE8_P (c))
3869         {
3870           c = CHAR_TO_BYTE8 (c);
3871           EMIT_ONE_BYTE (c);
3872         }
3873       else
3874         {
3875           struct charset *charset;
3876           /* Prefer the annotated charset when it contains C.  */
3877           if (preferred_charset_id >= 0)
3878             {
3879               charset = CHARSET_FROM_ID (preferred_charset_id);
3880               if (! CHAR_CHARSET_P (c, charset))
3881                 charset = char_charset (c, charset_list, NULL);
3882             }
3883           else
3884             charset = char_charset (c, charset_list, NULL);
3885           if (!charset)
3886             {
3887               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3888                 {
3889                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3890                   charset = CHARSET_FROM_ID (charset_ascii);
3891                 }
3892               else
3893                 {
3894                   c = coding->default_char;
3895                   charset = char_charset (c, charset_list, NULL);
3896                 }
3897             }
3898           ENCODE_ISO_CHARACTER (charset, c);
3899         }
3900     }
3901
3902   if (coding->mode & CODING_MODE_LAST_BLOCK
3903       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3904     {
3905       ASSURE_DESTINATION (safe_room);
3906       ENCODE_RESET_PLANE_AND_REGISTER ();
3907     }
3908   record_conversion_result (coding, CODING_RESULT_SUCCESS);
3909   CODING_ISO_BOL (coding) = bol_designation;
3910   coding->produced_char += produced_chars;
3911   coding->produced = dst - coding->destination;
3912   return 0;
3913 }
3914
3915 \f
3916 /*** 8. SJIS and BIG5 handlers ***/
3917
3918 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3919    quite widely.  So, for the moment, Emacs supports them in the bare
3920    C code.  But, in the future, they may be supported only by CCL.  */
3921
3922 /* SJIS is a coding system encoding three character sets: ASCII, right
3923    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3924    as is.  A character of charset katakana-jisx0201 is encoded by
3925    "position-code + 0x80".  A character of charset japanese-jisx0208
3926    is encoded in two bytes, with its two position-codes divided and
3927    shifted so that they fit in the ranges below.
3928
3929    --- CODE RANGE of SJIS ---
3930    (character set)      (range)
3931    ASCII                0x00 .. 0x7F
3932    KATAKANA-JISX0201    0xA0 .. 0xDF
3933    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3934             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3935    -------------------------------
3936
3937 */
3938
3939 /* BIG5 is a coding system encoding two character sets: ASCII and
3940    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3941    character set and is encoded in two bytes.
3942
3943    --- CODE RANGE of BIG5 ---
3944    (character set)      (range)
3945    ASCII                0x00 .. 0x7F
3946    Big5 (1st byte)      0xA1 .. 0xFE
3947         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3948    --------------------------
3949
3950   */
3951
3952 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3953    Check if a text is encoded in SJIS.  If it may be, record what was
3954    found in DETECT_INFO->found and return 1; else reject it, return 0.  */
3955
3956 static int
3957 detect_coding_sjis (coding, detect_info)
3958      struct coding_system *coding;
3959      struct coding_detection_info *detect_info;
3960 {
3961   const unsigned char *src = coding->source, *src_base;
3962   const unsigned char *src_end = coding->source + coding->src_bytes;
3963   int multibytep = coding->src_multibyte;
3964   int consumed_chars = 0;
3965   int found = 0;
3966   int c;
3967
3968   detect_info->checked |= CATEGORY_MASK_SJIS;
3969   /* A coding system of this category is always ASCII compatible.  */
3970   src += coding->head_ascii;
3971
3972   while (1)
3973     {
3974       src_base = src;
3975       ONE_MORE_BYTE (c);
3976       if (c < 0x80)
3977         continue;
3978       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3979         {   /* Lead byte of a two-byte character.  */
3980           ONE_MORE_BYTE (c);
3981           if (c < 0x40 || c == 0x7F || c > 0xFC)
3982             break;
3983           found = CATEGORY_MASK_SJIS;
3984         }
3985       else if (c >= 0xA0 && c < 0xE0)
3986         found = CATEGORY_MASK_SJIS;  /* Single-byte kana range.  */
3987       else
3988         break;
3989     }
3990   detect_info->rejected |= CATEGORY_MASK_SJIS;
3991   return 0;
3992   /* Presumably ONE_MORE_BYTE jumps here at end of source (confirm).  */
3993  no_more_source:
3994   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
3995     {   /* SRC moved past SRC_BASE: a sequence is cut off in the last block.  */
3996       detect_info->rejected |= CATEGORY_MASK_SJIS;
3997       return 0;
3998     }
3999   detect_info->found |= found;
4000   return 1;
4001 }
4002
4003 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4004    Check if a text is encoded in BIG5.  If it may be, record what was
4005    found in DETECT_INFO->found and return 1; else reject it, return 0.  */
4006
4007 static int
4008 detect_coding_big5 (coding, detect_info)
4009      struct coding_system *coding;
4010      struct coding_detection_info *detect_info;
4011 {
4012   const unsigned char *src = coding->source, *src_base;
4013   const unsigned char *src_end = coding->source + coding->src_bytes;
4014   int multibytep = coding->src_multibyte;
4015   int consumed_chars = 0;
4016   int found = 0;
4017   int c;
4018
4019   detect_info->checked |= CATEGORY_MASK_BIG5;
4020   /* A coding system of this category is always ASCII compatible.  */
4021   src += coding->head_ascii;
4022
4023   while (1)
4024     {
4025       src_base = src;
4026       ONE_MORE_BYTE (c);
4027       if (c < 0x80)
4028         continue;
4029       if (c >= 0xA1)
4030         {   /* Lead byte of a two-byte character.  */
4031           ONE_MORE_BYTE (c);
4032           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
4033             break;  /* Invalid trail byte: reject, as for a bad lead byte.  */
4034           found = CATEGORY_MASK_BIG5;
4035         }
4036       else
4037         break;
4038     }
4039   detect_info->rejected |= CATEGORY_MASK_BIG5;
4040   return 0;
4041   /* Presumably ONE_MORE_BYTE jumps here at end of source (confirm).  */
4042  no_more_source:
4043   if (src_base < src && coding->mode & CODING_MODE_LAST_BLOCK)
4044     {   /* SRC moved past SRC_BASE: a sequence is cut off in the last block.  */
4045       detect_info->rejected |= CATEGORY_MASK_BIG5;
4046       return 0;
4047     }
4048   detect_info->found |= found;
4049   return 1;
4050 }
4051
4052 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
4053    Decode SJIS text from CODING->source into CODING->charbuf.  */
4054
4055 static void
4056 decode_coding_sjis (coding)
4057      struct coding_system *coding;
4058 {
4059   const unsigned char *src = coding->source + coding->consumed;
4060   const unsigned char *src_end = coding->source + coding->src_bytes;
4061   const unsigned char *src_base;
4062   int *charbuf = coding->charbuf + coding->charbuf_used;
4063   int *charbuf_end
4064     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4065   int consumed_chars = 0, consumed_chars_base;
4066   int multibytep = coding->src_multibyte;
4067   struct charset *charset_roman, *charset_kanji, *charset_kana;
4068   struct charset *charset_kanji2;
4069   Lisp_Object attrs, charset_list, val;
4070   int char_offset = coding->produced_char;
4071   int last_offset = char_offset;
4072   int last_id = charset_ascii;
4073
4074   CODING_GET_INFO (coding, attrs, charset_list);
4075   /* The charset list holds roman, kana, kanji and optionally kanji2.  */
4076   val = charset_list;
4077   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4078   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4079   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4080   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4081
4082   while (1)
4083     {
4084       int c, c1;
4085       struct charset *charset;
4086
4087       src_base = src;
4088       consumed_chars_base = consumed_chars;
4089       /* Stop when CHARBUF is full up to the room kept for an annotation.  */
4090       if (charbuf >= charbuf_end)
4091         break;
4092
4093       ONE_MORE_BYTE (c);
4094       if (c < 0)
4095         goto invalid_code;
4096       if (c < 0x80)
4097         charset = charset_roman;
4098       else if (c == 0x80 || c == 0xA0)
4099         goto invalid_code;
4100       else if (c >= 0xA1 && c <= 0xDF)
4101         {
4102           /* SJIS -> JISX0201-Kana */
4103           c &= 0x7F;
4104           charset = charset_kana;
4105         }
4106       else if (c <= 0xEF)
4107         {
4108           /* SJIS -> JISX0208 */
4109           ONE_MORE_BYTE (c1);
4110           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4111             goto invalid_code;
4112           c = (c << 8) | c1;
4113           SJIS_TO_JIS (c);
4114           charset = charset_kanji;
4115         }
4116       else if (c <= 0xFC && charset_kanji2)
4117         {
4118           /* SJIS -> JISX0213-2 */
4119           ONE_MORE_BYTE (c1);
4120           if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
4121             goto invalid_code;
4122           c = (c << 8) | c1;
4123           SJIS_TO_JIS2 (c);
4124           charset = charset_kanji2;
4125         }
4126       else
4127         goto invalid_code;
4128       if (charset->id != charset_ascii
4129           && last_id != charset->id)
4130         {   /* A run of a new non-ASCII charset starts here.  */
4131           if (last_id != charset_ascii)
4132             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4133           last_id = charset->id;
4134           last_offset = char_offset;
4135         }
4136       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4137       *charbuf++ = c;
4138       char_offset++;
4139       continue;
4140       /* On an invalid sequence, rewind and store just one source byte.  */
4141     invalid_code:
4142       src = src_base;
4143       consumed_chars = consumed_chars_base;
4144       ONE_MORE_BYTE (c);
4145       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4146       char_offset++;
4147       coding->errors++;
4148     }
4149   /* Flush the pending charset run; report progress up to SRC_BASE.  */
4150  no_more_source:
4151   if (last_id != charset_ascii)
4152     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4153   coding->consumed_char += consumed_chars_base;
4154   coding->consumed = src_base - coding->source;
4155   coding->charbuf_used = charbuf - coding->charbuf;
4156 }
4157
     /* Decode BIG5 text from CODING->source into CODING->charbuf.  The
        charset list of the coding system gives the roman charset and
        then the Big5 charset.  */
4158 static void
4159 decode_coding_big5 (coding)
4160      struct coding_system *coding;
4161 {
4162   const unsigned char *src = coding->source + coding->consumed;
4163   const unsigned char *src_end = coding->source + coding->src_bytes;
4164   const unsigned char *src_base;
4165   int *charbuf = coding->charbuf + coding->charbuf_used;
4166   int *charbuf_end
4167     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4168   int consumed_chars = 0, consumed_chars_base;
4169   int multibytep = coding->src_multibyte;
4170   struct charset *charset_roman, *charset_big5;
4171   Lisp_Object attrs, charset_list, val;
4172   int char_offset = coding->produced_char;
4173   int last_offset = char_offset;
4174   int last_id = charset_ascii;
4175
4176   CODING_GET_INFO (coding, attrs, charset_list);
4177   val = charset_list;
4178   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4179   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4180
4181   while (1)
4182     {
4183       int c, c1;
4184       struct charset *charset;
4185
4186       src_base = src;
4187       consumed_chars_base = consumed_chars;
4188       /* Stop when CHARBUF is full up to the room kept for an annotation.  */
4189       if (charbuf >= charbuf_end)
4190         break;
4191
4192       ONE_MORE_BYTE (c);
4193
4194       if (c < 0)
4195         goto invalid_code;
4196       if (c < 0x80)
4197         charset = charset_roman;
4198       else
4199         {
4200           /* BIG5 -> Big5 */
4201           if (c < 0xA1 || c > 0xFE)
4202             goto invalid_code;
4203           ONE_MORE_BYTE (c1);
4204           if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4205             goto invalid_code;
4206           c = c << 8 | c1;
4207           charset = charset_big5;
4208         }
4209       if (charset->id != charset_ascii
4210           && last_id != charset->id)
4211         {   /* A run of a new non-ASCII charset starts here.  */
4212           if (last_id != charset_ascii)
4213             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4214           last_id = charset->id;
4215           last_offset = char_offset;
4216         }
4217       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4218       *charbuf++ = c;
4219       char_offset++;
4220       continue;
4221       /* On an invalid sequence, rewind and store just one source byte.  */
4222     invalid_code:
4223       src = src_base;
4224       consumed_chars = consumed_chars_base;
4225       ONE_MORE_BYTE (c);
4226       *charbuf++ = c < 0 ? -c : BYTE8_TO_CHAR (c);
4227       char_offset++;
4228       coding->errors++;
4229     }
4230   /* Flush the pending charset run; report progress up to SRC_BASE.  */
4231  no_more_source:
4232   if (last_id != charset_ascii)
4233     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4234   coding->consumed_char += consumed_chars_base;
4235   coding->consumed = src_base - coding->source;
4236   coding->charbuf_used = charbuf - coding->charbuf;
4237 }
4238
4239 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4240    Encode the characters in CODING->charbuf as SJIS text.  The charset
4241    list of the coding system gives, in order, the roman, kana and
4242    kanji charsets, optionally followed by a second kanji charset
4243    (JISX0213 plane 2).  A character of none of them is replaced by
4244    CODING->default_char, or by CODING_INHIBIT_CHARACTER_SUBSTITUTION
4245    in safe-encoding mode.  Always return 0.  */
4246
4247 static int
4248 encode_coding_sjis (coding)
4249      struct coding_system *coding;
4250 {
4251   int multibytep = coding->dst_multibyte;
4252   int *charbuf = coding->charbuf;
4253   int *charbuf_end = charbuf + coding->charbuf_used;
4254   unsigned char *dst = coding->destination + coding->produced;
4255   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4256   int safe_room = 4;
4257   int produced_chars = 0;
4258   Lisp_Object attrs, charset_list, val;
4259   int ascii_compatible;
4260   struct charset *charset_roman, *charset_kanji, *charset_kana;
4261   struct charset *charset_kanji2;
4262   int c;
4263
4264   CODING_GET_INFO (coding, attrs, charset_list);
4265   val = charset_list;
4266   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4267   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4268   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4269   charset_kanji2 = NILP (val) ? NULL : CHARSET_FROM_ID (XINT (XCAR (val)));
4270
4271   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4272
4273   while (charbuf < charbuf_end)
4274     {
4275       ASSURE_DESTINATION (safe_room);
4276       c = *charbuf++;
4277       /* Now encode the character C.  */
4278       if (ASCII_CHAR_P (c) && ascii_compatible)
4279         EMIT_ONE_ASCII_BYTE (c);
4280       else if (CHAR_BYTE8_P (c))
4281         {
4282           c = CHAR_TO_BYTE8 (c);
4283           EMIT_ONE_BYTE (c);
4284         }
4285       else
4286         {
4287           unsigned code;
4288           struct charset *charset = char_charset (c, charset_list, &code);
4289
4290           if (!charset)
4291             {
4292               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4293                 {
4294                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4295                   charset = CHARSET_FROM_ID (charset_ascii);
4296                 }
4297               else
4298                 {
4299                   c = coding->default_char;
4300                   charset = char_charset (c, charset_list, &code);
4301                 }
4302             }
4303           if (code == CHARSET_INVALID_CODE (charset))
4304             abort ();
4305           if (charset == charset_kanji)
4306             {
4307               int c1, c2;
4308               JIS_TO_SJIS (code);
4309               c1 = code >> 8, c2 = code & 0xFF;
4310               EMIT_TWO_BYTES (c1, c2);
4311             }
4312           else if (charset == charset_kana)
4313             EMIT_ONE_BYTE (code | 0x80);
4314           else if (charset_kanji2 && charset == charset_kanji2)
4315             {
4316               int c1, c2;
4317               /* Plane 2 has rows 1, 3-5, 8, 12-15 and 78-94 only.  */
4318               c1 = code >> 8;
4319               if (c1 == 0x21 || (c1 >= 0x23 && c1 <= 0x25) || c1 == 0x28
4320                   || (c1 >= 0x2C && c1 <= 0x2F) || c1 >= 0x6E)
4321                 {
4322                   JIS_TO_SJIS2 (code);
4323                   c1 = code >> 8, c2 = code & 0xFF;
4324                   EMIT_TWO_BYTES (c1, c2);
4325                 }
4326               else
4327                 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4328             }
4329           else
4330             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4331         }
4332     }
4333   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4334   coding->produced_char += produced_chars;
4335   coding->produced = dst - coding->destination;
4336   return 0;
4337 }
4338
     /* Encode the characters in CODING->charbuf as BIG5 text.  The
        charset list of the coding system gives the roman charset and
        then the Big5 charset.  Always return 0.  */
4339 static int
4340 encode_coding_big5 (coding)
4341      struct coding_system *coding;
4342 {
4343   int multibytep = coding->dst_multibyte;
4344   int *charbuf = coding->charbuf;
4345   int *charbuf_end = charbuf + coding->charbuf_used;
4346   unsigned char *dst = coding->destination + coding->produced;
4347   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4348   int safe_room = 4;
4349   int produced_chars = 0;
4350   Lisp_Object attrs, charset_list, val;
4351   int ascii_compatible;
4352   struct charset *charset_roman, *charset_big5;
4353   int c;
4354
4355   CODING_GET_INFO (coding, attrs, charset_list);
4356   val = charset_list;
4357   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4358   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4359   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4360
4361   while (charbuf < charbuf_end)
4362     {
4363       ASSURE_DESTINATION (safe_room);
4364       c = *charbuf++;
4365       /* Now encode the character C.  */
4366       if (ASCII_CHAR_P (c) && ascii_compatible)
4367         EMIT_ONE_ASCII_BYTE (c);
4368       else if (CHAR_BYTE8_P (c))
4369         {
4370           c = CHAR_TO_BYTE8 (c);
4371           EMIT_ONE_BYTE (c);
4372         }
4373       else
4374         {
4375           unsigned code;
4376           struct charset *charset = char_charset (c, charset_list, &code);
4377
4378           if (! charset)
4379             {   /* C is in no charset of the list: substitute for it.  */
4380               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4381                 {
4382                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4383                   charset = CHARSET_FROM_ID (charset_ascii);
4384                 }
4385               else
4386                 {
4387                   c = coding->default_char;
4388                   charset = char_charset (c, charset_list, &code);
4389                 }
4390             }
4391           if (code == CHARSET_INVALID_CODE (charset))
4392             abort ();
4393           if (charset == charset_big5)
4394             {
4395               int c1, c2;
4396               /* The high and low bytes of CODE are emitted as they are.  */
4397               c1 = code >> 8, c2 = code & 0xFF;
4398               EMIT_TWO_BYTES (c1, c2);
4399             }
4400           else
4401             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4402         }
4403     }
4404   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4405   coding->produced_char += produced_chars;
4406   coding->produced = dst - coding->destination;
4407   return 0;
4408 }
4409
4410 \f
4411 /*** 9. CCL handlers ***/
4412
4413 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4414    Check if a text is encoded in a coding system of which
4415    encoder/decoder are written in CCL program.  If it may be, record
4416    the finding in DETECT_INFO->found and return 1; else return 0.  */
4417
4418 static int
4419 detect_coding_ccl (coding, detect_info)
4420      struct coding_system *coding;
4421      struct coding_detection_info *detect_info;
4422 {
4423   const unsigned char *src = coding->source, *src_base;
4424   const unsigned char *src_end = coding->source + coding->src_bytes;
4425   int multibytep = coding->src_multibyte;
4426   int consumed_chars = 0;
4427   int found = 0;
4428   unsigned char *valids;
4429   int head_ascii = coding->head_ascii;
4430   Lisp_Object attrs;
4431
4432   detect_info->checked |= CATEGORY_MASK_CCL;
4433   /* From here on CODING is the CCL category's entry, not the argument.  */
4434   coding = &coding_categories[coding_category_ccl];
4435   valids = CODING_CCL_VALIDS (coding);
4436   attrs = CODING_ID_ATTRS (coding->id);
4437   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4438     src += head_ascii;
4439
4440   while (1)
4441     {
4442       int c;
4443
4444       src_base = src;
4445       ONE_MORE_BYTE (c);
4446       if (c < 0 || ! valids[c])
4447         break;
4448       if ((valids[c] > 1))
4449         found = CATEGORY_MASK_CCL;
4450     }
4451   detect_info->rejected |= CATEGORY_MASK_CCL;
4452   return 0;
4453   /* Presumably ONE_MORE_BYTE jumps here at end of source (confirm).  */
4454  no_more_source:
4455   detect_info->found |= found;
4456   return 1;
4457 }
4458
     /* Decode CODING->source into CODING->charbuf by running the CCL
        decoder of CODING on chunks of at most 1024 source elements.
        The outcome is recorded in CODING from the final CCL status.  */
4459 static void
4460 decode_coding_ccl (coding)
4461      struct coding_system *coding;
4462 {
4463   const unsigned char *src = coding->source + coding->consumed;
4464   const unsigned char *src_end = coding->source + coding->src_bytes;
4465   int *charbuf = coding->charbuf + coding->charbuf_used;
4466   int *charbuf_end = coding->charbuf + coding->charbuf_size;
4467   int consumed_chars = 0;
4468   int multibytep = coding->src_multibyte;
4469   struct ccl_program ccl;
4470   int source_charbuf[1024];
4471   int source_byteidx[1024];  /* Byte offsets; filled for multibyte only.  */
4472   Lisp_Object attrs, charset_list;
4473
4474   CODING_GET_INFO (coding, attrs, charset_list);
4475   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4476
4477   while (src < src_end)
4478     {
4479       const unsigned char *p = src;
4480       int *source, *source_end;
4481       int i = 0;
4482
4483       if (multibytep)
4484         while (i < 1024 && p < src_end)
4485           {
4486             source_byteidx[i] = p - src;
4487             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4488           }
4489       else
4490         while (i < 1024 && p < src_end)
4491           source_charbuf[i++] = *p++;
4492
4493       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4494         ccl.last_block = 1;
4495
4496       source = source_charbuf;
4497       source_end = source + i;
4498       while (source < source_end)
4499         {
4500           ccl_driver (&ccl, source, charbuf,
4501                       source_end - source, charbuf_end - charbuf,
4502                       charset_list);
4503           source += ccl.consumed;
4504           charbuf += ccl.produced;
4505           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4506             break;
4507         }
4508       /* Unibyte source has one byte per element; SOURCE_BYTEIDX is unset.  */
4509       src = (source >= source_end ? p
4510              : multibytep ? src + source_byteidx[source - source_charbuf]
4511              : src + (source - source_charbuf));
4512       consumed_chars += source - source_charbuf;
4513       /* NOTE(review): CODING_RESULT_* vs. ccl.status below -- confirm.  */
4514       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4515           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4516         break;
4517     }
4518
4519   switch (ccl.status)
4520     {
4521     case CCL_STAT_SUSPEND_BY_SRC:
4522       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4523       break;
4524     case CCL_STAT_SUSPEND_BY_DST:
4525       break;
4526     case CCL_STAT_QUIT:
4527     case CCL_STAT_INVALID_CMD:
4528       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4529       break;
4530     default:
4531       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4532       break;
4533     }
4534   coding->consumed_char += consumed_chars;
4535   coding->consumed = src - coding->source;
4536   coding->charbuf_used = charbuf - coding->charbuf;
4537 }
4538
     /* Encode the characters in CODING->charbuf by running the CCL
        encoder of CODING, giving ccl_driver a destination size of at
        most 1024 per call.  The outcome is recorded in CODING from the
        final CCL status.  Always return 0.  */
4539 static int
4540 encode_coding_ccl (coding)
4541      struct coding_system *coding;
4542 {
4543   struct ccl_program ccl;
4544   int multibytep = coding->dst_multibyte;
4545   int *charbuf = coding->charbuf;
4546   int *charbuf_end = charbuf + coding->charbuf_used;
4547   unsigned char *dst = coding->destination + coding->produced;
4548   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4549   unsigned char *adjusted_dst_end = dst_end - 1;
4550   int destination_charbuf[1024];
4551   int i, produced_chars = 0;
4552   Lisp_Object attrs, charset_list;
4553
4554   CODING_GET_INFO (coding, attrs, charset_list);
4555   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4556
4557   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4558   ccl.dst_multibyte = coding->dst_multibyte;
4559
4560   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4561     {
4562       int dst_bytes = dst_end - dst;
4563       if (dst_bytes > 1024)
4564         dst_bytes = 1024;  /* The size of DESTINATION_CHARBUF.  */
4565
4566       ccl_driver (&ccl, charbuf, destination_charbuf,
4567                   charbuf_end - charbuf, dst_bytes, charset_list);
4568       charbuf += ccl.consumed;
4569       if (multibytep)
4570         for (i = 0; i < ccl.produced; i++)
4571           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4572       else
4573         {   /* Copy the low byte of each produced element to DST.  */
4574           for (i = 0; i < ccl.produced; i++)
4575             *dst++ = destination_charbuf[i] & 0xFF;
4576           produced_chars += ccl.produced;
4577         }
4578     }
4579
4580   switch (ccl.status)
4581     {
4582     case CCL_STAT_SUSPEND_BY_SRC:
4583       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_SRC);
4584       break;
4585     case CCL_STAT_SUSPEND_BY_DST:
4586       record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_DST);
4587       break;
4588     case CCL_STAT_QUIT:
4589     case CCL_STAT_INVALID_CMD:
4590       record_conversion_result (coding, CODING_RESULT_INTERRUPT);
4591       break;
4592     default:
4593       record_conversion_result (coding, CODING_RESULT_SUCCESS);
4594       break;
4595     }
4596
4597   coding->produced_char += produced_chars;
4598   coding->produced = dst - coding->destination;
4599   return 0;
4600 }
4601
4602
4603 \f
4604 /*** 10, 11. no-conversion handlers ***/
4605
4606 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4607
4608 static void
4609 decode_coding_raw_text (coding)
4610      struct coding_system *coding;
4611 {
4612   coding->chars_at_source = 1;
4613   coding->consumed_char = 0;
4614   coding->consumed = 0;
4615   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4616 }
4617
4618 static int
4619 encode_coding_raw_text (coding)
4620      struct coding_system *coding;
4621 {
4622   int multibytep = coding->dst_multibyte;
4623   int *charbuf = coding->charbuf;
4624   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4625   unsigned char *dst = coding->destination + coding->produced;
4626   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4627   int produced_chars = 0;
4628   int c;
4629
4630   if (multibytep)
4631     {
4632       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4633
4634       if (coding->src_multibyte)
4635         while (charbuf < charbuf_end)
4636           {
4637             ASSURE_DESTINATION (safe_room);
4638             c = *charbuf++;
4639             if (ASCII_CHAR_P (c))
4640               EMIT_ONE_ASCII_BYTE (c);
4641             else if (CHAR_BYTE8_P (c))
4642               {
4643                 c = CHAR_TO_BYTE8 (c);
4644                 EMIT_ONE_BYTE (c);
4645               }
4646             else
4647               {
4648                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4649
4650                 CHAR_STRING_ADVANCE (c, p1);
4651                 while (p0 < p1)
4652                   {
4653                     EMIT_ONE_BYTE (*p0);
4654                     p0++;
4655                   }
4656               }
4657           }
4658       else
4659         while (charbuf < charbuf_end)
4660           {
4661             ASSURE_DESTINATION (safe_room);
4662             c = *charbuf++;
4663             EMIT_ONE_BYTE (c);
4664           }
4665     }
4666   else
4667     {
4668       if (coding->src_multibyte)
4669         {
4670           int safe_room = MAX_MULTIBYTE_LENGTH;
4671
4672           while (charbuf < charbuf_end)
4673             {
4674               ASSURE_DESTINATION (safe_room);
4675               c = *charbuf++;
4676               if (ASCII_CHAR_P (c))
4677                 *dst++ = c;
4678               else if (CHAR_BYTE8_P (c))
4679                 *dst++ = CHAR_TO_BYTE8 (c);
4680               else
4681                 CHAR_STRING_ADVANCE (c, dst);
4682               produced_chars++;
4683             }
4684         }
4685       else
4686         {
4687           ASSURE_DESTINATION (charbuf_end - charbuf);
4688           while (charbuf < charbuf_end && dst < dst_end)
4689             *dst++ = *charbuf++;
4690           produced_chars = dst - (coding->destination + coding->dst_bytes);
4691         }
4692     }
4693   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4694   coding->produced_char += produced_chars;
4695   coding->produced = dst - coding->destination;
4696   return 0;
4697 }
4698
4699 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4700    Check if a text is encoded in a charset-based coding system.  If it
4701    is, return 1, else return 0.  */
4702
4703 static int
4704 detect_coding_charset (coding, detect_info)
4705      struct coding_system *coding;
4706      struct coding_detection_info *detect_info;
4707 {
4708   const unsigned char *src = coding->source, *src_base;
4709   const unsigned char *src_end = coding->source + coding->src_bytes;
4710   int multibytep = coding->src_multibyte;
4711   int consumed_chars = 0;
4712   Lisp_Object attrs, valids;
4713   int found = 0;
4714
4715   detect_info->checked |= CATEGORY_MASK_CHARSET;
4716
4717   coding = &coding_categories[coding_category_charset];
4718   attrs = CODING_ID_ATTRS (coding->id);
4719   valids = AREF (attrs, coding_attr_charset_valids);
4720
4721   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4722     src += coding->head_ascii;
4723
4724   while (1)
4725     {
4726       int c;
4727
4728       src_base = src;
4729       ONE_MORE_BYTE (c);
4730       if (c < 0)
4731         continue;
4732       if (NILP (AREF (valids, c)))
4733         break;
4734       if (c >= 0x80)
4735         found = CATEGORY_MASK_CHARSET;
4736     }
4737   detect_info->rejected |= CATEGORY_MASK_CHARSET;
4738   return 0;
4739
4740  no_more_source:
4741   detect_info->found |= found;
4742   return 1;
4743 }
4744
4745 static void
4746 decode_coding_charset (coding)
4747      struct coding_system *coding;
4748 {
4749   const unsigned char *src = coding->source + coding->consumed;
4750   const unsigned char *src_end = coding->source + coding->src_bytes;
4751   const unsigned char *src_base;
4752   int *charbuf = coding->charbuf + coding->charbuf_used;
4753   int *charbuf_end
4754     = coding->charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4755   int consumed_chars = 0, consumed_chars_base;
4756   int multibytep = coding->src_multibyte;
4757   Lisp_Object attrs, charset_list, valids;
4758   int char_offset = coding->produced_char;
4759   int last_offset = char_offset;
4760   int last_id = charset_ascii;
4761
4762   CODING_GET_INFO (coding, attrs, charset_list);
4763   valids = AREF (attrs, coding_attr_charset_valids);
4764
4765   while (1)
4766     {
4767       int c;
4768       Lisp_Object val;
4769       struct charset *charset;
4770       int dim;
4771       int len = 1;
4772       unsigned code;
4773
4774       src_base = src;
4775       consumed_chars_base = consumed_chars;
4776
4777       if (charbuf >= charbuf_end)
4778         break;
4779
4780       ONE_MORE_BYTE (c);
4781       if (c < 0)
4782         goto invalid_code;
4783       code = c;
4784
4785       val = AREF (valids, c);
4786       if (NILP (val))
4787         goto invalid_code;
4788       if (INTEGERP (val))
4789         {
4790           charset = CHARSET_FROM_ID (XFASTINT (val));
4791           dim = CHARSET_DIMENSION (charset);
4792           while (len < dim)
4793             {
4794               ONE_MORE_BYTE (c);
4795               code = (code << 8) | c;
4796               len++;
4797             }
4798           CODING_DECODE_CHAR (coding, src, src_base, src_end,
4799                               charset, code, c);
4800         }
4801       else
4802         {
4803           /* VAL is a list of charset IDs.  It is assured that the
4804              list is sorted by charset dimensions (smaller one
4805              comes first).  */
4806           while (CONSP (val))
4807             {
4808               charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4809               dim = CHARSET_DIMENSION (charset);
4810               while (len < dim)
4811                 {
4812                   ONE_MORE_BYTE (c);
4813                   code = (code << 8) | c;
4814                   len++;
4815                 }
4816               CODING_DECODE_CHAR (coding, src, src_base,
4817                                   src_end, charset, code, c);
4818               if (c >= 0)
4819                 break;
4820               val = XCDR (val);
4821             }
4822         }
4823       if (c < 0)
4824         goto invalid_code;
4825       if (charset->id != charset_ascii
4826           && last_id != charset->id)
4827         {
4828           if (last_id != charset_ascii)
4829             ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4830           last_id = charset->id;
4831           last_offset = char_offset;
4832         }
4833
4834       *charbuf++ = c;
4835       char_offset++;
4836       continue;
4837
4838     invalid_code:
4839       src = src_base;
4840       consumed_chars = consumed_chars_base;
4841       ONE_MORE_BYTE (c);
4842       *charbuf++ = c < 0 ? -c : ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4843       char_offset++;
4844       coding->errors++;
4845     }
4846
4847  no_more_source:
4848   if (last_id != charset_ascii)
4849     ADD_CHARSET_DATA (charbuf, char_offset - last_offset, last_id);
4850   coding->consumed_char += consumed_chars_base;
4851   coding->consumed = src_base - coding->source;
4852   coding->charbuf_used = charbuf - coding->charbuf;
4853 }
4854
4855 static int
4856 encode_coding_charset (coding)
4857      struct coding_system *coding;
4858 {
4859   int multibytep = coding->dst_multibyte;
4860   int *charbuf = coding->charbuf;
4861   int *charbuf_end = charbuf + coding->charbuf_used;
4862   unsigned char *dst = coding->destination + coding->produced;
4863   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4864   int safe_room = MAX_MULTIBYTE_LENGTH;
4865   int produced_chars = 0;
4866   Lisp_Object attrs, charset_list;
4867   int ascii_compatible;
4868   int c;
4869
4870   CODING_GET_INFO (coding, attrs, charset_list);
4871   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4872
4873   while (charbuf < charbuf_end)
4874     {
4875       struct charset *charset;
4876       unsigned code;
4877
4878       ASSURE_DESTINATION (safe_room);
4879       c = *charbuf++;
4880       if (ascii_compatible && ASCII_CHAR_P (c))
4881         EMIT_ONE_ASCII_BYTE (c);
4882       else if (CHAR_BYTE8_P (c))
4883         {
4884           c = CHAR_TO_BYTE8 (c);
4885           EMIT_ONE_BYTE (c);
4886         }
4887       else
4888         {
4889           charset = char_charset (c, charset_list, &code);
4890           if (charset)
4891             {
4892               if (CHARSET_DIMENSION (charset) == 1)
4893                 EMIT_ONE_BYTE (code);
4894               else if (CHARSET_DIMENSION (charset) == 2)
4895                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4896               else if (CHARSET_DIMENSION (charset) == 3)
4897                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4898               else
4899                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4900                                  (code >> 8) & 0xFF, code & 0xFF);
4901             }
4902           else
4903             {
4904               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4905                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4906               else
4907                 c = coding->default_char;
4908               EMIT_ONE_BYTE (c);
4909             }
4910         }
4911     }
4912
4913   record_conversion_result (coding, CODING_RESULT_SUCCESS);
4914   coding->produced_char += produced_chars;
4915   coding->produced = dst - coding->destination;
4916   return 0;
4917 }
4918
4919 \f
4920 /*** 10. C library functions ***/
4921
4922 /* Setup coding context CODING from information about CODING_SYSTEM.
4923    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4924    CODING_SYSTEM is invalid, signal an error.  */
4925
4926 void
4927 setup_coding_system (coding_system, coding)
4928      Lisp_Object coding_system;
4929      struct coding_system *coding;
4930 {
4931   Lisp_Object attrs;
4932   Lisp_Object eol_type;
4933   Lisp_Object coding_type;
4934   Lisp_Object val;
4935
4936   if (NILP (coding_system))
4937     coding_system = Qno_conversion;
4938
4939   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4940
4941   attrs = CODING_ID_ATTRS (coding->id);
4942   eol_type = CODING_ID_EOL_TYPE (coding->id);
4943
4944   coding->mode = 0;
4945   coding->head_ascii = -1;
4946   coding->common_flags
4947     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4948   if (! NILP (CODING_ATTR_POST_READ (attrs)))
4949     coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4950   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4951     coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4952   if (! NILP (CODING_ATTR_FOR_UNIBYTE (attrs)))
4953     coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4954
4955   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4956   coding->max_charset_id = SCHARS (val) - 1;
4957   coding->safe_charsets = (char *) SDATA (val);
4958   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4959
4960   coding_type = CODING_ATTR_TYPE (attrs);
4961   if (EQ (coding_type, Qundecided))
4962     {
4963       coding->detector = NULL;
4964       coding->decoder = decode_coding_raw_text;
4965       coding->encoder = encode_coding_raw_text;
4966       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4967     }
4968   else if (EQ (coding_type, Qiso_2022))
4969     {
4970       int i;
4971       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4972
4973       /* Invoke graphic register 0 to plane 0.  */
4974       CODING_ISO_INVOCATION (coding, 0) = 0;
4975       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4976       CODING_ISO_INVOCATION (coding, 1)
4977         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4978       /* Setup the initial status of designation.  */
4979       for (i = 0; i < 4; i++)
4980         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4981       /* Not single shifting initially.  */
4982       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4983       /* Beginning of buffer should also be regarded as bol. */
4984       CODING_ISO_BOL (coding) = 1;
4985       coding->detector = detect_coding_iso_2022;
4986       coding->decoder = decode_coding_iso_2022;
4987       coding->encoder = encode_coding_iso_2022;
4988       if (flags & CODING_ISO_FLAG_SAFE)
4989         coding->mode |= CODING_MODE_SAFE_ENCODING;
4990       coding->common_flags
4991         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4992             | CODING_REQUIRE_FLUSHING_MASK);
4993       if (flags & CODING_ISO_FLAG_COMPOSITION)
4994         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4995       if (flags & CODING_ISO_FLAG_DESIGNATION)
4996         coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4997       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4998         {
4999           setup_iso_safe_charsets (attrs);
5000           val = CODING_ATTR_SAFE_CHARSETS (attrs);
5001           coding->max_charset_id = SCHARS (val) - 1;
5002           coding->safe_charsets = (char *) SDATA (val);
5003         }
5004       CODING_ISO_FLAGS (coding) = flags;
5005     }
5006   else if (EQ (coding_type, Qcharset))
5007     {
5008       coding->detector = detect_coding_charset;
5009       coding->decoder = decode_coding_charset;
5010       coding->encoder = encode_coding_charset;
5011       coding->common_flags
5012         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5013     }
5014   else if (EQ (coding_type, Qutf_8))
5015     {
5016       coding->detector = detect_coding_utf_8;
5017       coding->decoder = decode_coding_utf_8;
5018       coding->encoder = encode_coding_utf_8;
5019       coding->common_flags
5020         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5021     }
5022   else if (EQ (coding_type, Qutf_16))
5023     {
5024       val = AREF (attrs, coding_attr_utf_16_bom);
5025       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
5026                                     : EQ (val, Qt) ? utf_16_with_bom
5027                                     : utf_16_without_bom);
5028       val = AREF (attrs, coding_attr_utf_16_endian);
5029       CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
5030                                        : utf_16_little_endian);
5031       CODING_UTF_16_SURROGATE (coding) = 0;
5032       coding->detector = detect_coding_utf_16;
5033       coding->decoder = decode_coding_utf_16;
5034       coding->encoder = encode_coding_utf_16;
5035       coding->common_flags
5036         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5037       if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
5038         coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
5039     }
5040   else if (EQ (coding_type, Qccl))
5041     {
5042       coding->detector = detect_coding_ccl;
5043       coding->decoder = decode_coding_ccl;
5044       coding->encoder = encode_coding_ccl;
5045       coding->common_flags
5046         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
5047             | CODING_REQUIRE_FLUSHING_MASK);
5048     }
5049   else if (EQ (coding_type, Qemacs_mule))
5050     {
5051       coding->detector = detect_coding_emacs_mule;
5052       coding->decoder = decode_coding_emacs_mule;
5053       coding->encoder = encode_coding_emacs_mule;
5054       coding->common_flags
5055         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5056       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
5057           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
5058         {
5059           Lisp_Object tail, safe_charsets;
5060           int max_charset_id = 0;
5061
5062           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5063                tail = XCDR (tail))
5064             if (max_charset_id < XFASTINT (XCAR (tail)))
5065               max_charset_id = XFASTINT (XCAR (tail));
5066           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
5067                                         make_number (255));
5068           for (tail = Vemacs_mule_charset_list; CONSP (tail);
5069                tail = XCDR (tail))
5070             SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
5071           coding->max_charset_id = max_charset_id;
5072           coding->safe_charsets = (char *) SDATA (safe_charsets);
5073         }
5074     }
5075   else if (EQ (coding_type, Qshift_jis))
5076     {
5077       coding->detector = detect_coding_sjis;
5078       coding->decoder = decode_coding_sjis;
5079       coding->encoder = encode_coding_sjis;
5080       coding->common_flags
5081         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5082     }
5083   else if (EQ (coding_type, Qbig5))
5084     {
5085       coding->detector = detect_coding_big5;
5086       coding->decoder = decode_coding_big5;
5087       coding->encoder = encode_coding_big5;
5088       coding->common_flags
5089         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
5090     }
5091   else                          /* EQ (coding_type, Qraw_text) */
5092     {
5093       coding->detector = NULL;
5094       coding->decoder = decode_coding_raw_text;
5095       coding->encoder = encode_coding_raw_text;
5096       if (! EQ (eol_type, Qunix))
5097         {
5098           coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
5099           if (! VECTORP (eol_type))
5100             coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
5101         }
5102
5103     }
5104
5105   return;
5106 }
5107
5108 /* Return raw-text or one of its subsidiaries that has the same
5109    eol_type as CODING-SYSTEM.  */
5110
5111 Lisp_Object
5112 raw_text_coding_system (coding_system)
5113      Lisp_Object coding_system;
5114 {
5115   Lisp_Object spec, attrs;
5116   Lisp_Object eol_type, raw_text_eol_type;
5117
5118   if (NILP (coding_system))
5119     return Qraw_text;
5120   spec = CODING_SYSTEM_SPEC (coding_system);
5121   attrs = AREF (spec, 0);
5122
5123   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
5124     return coding_system;
5125
5126   eol_type = AREF (spec, 2);
5127   if (VECTORP (eol_type))
5128     return Qraw_text;
5129   spec = CODING_SYSTEM_SPEC (Qraw_text);
5130   raw_text_eol_type = AREF (spec, 2);
5131   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5132           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5133           : AREF (raw_text_eol_type, 2));
5134 }
5135
5136
5137 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5138    does, return one of the subsidiary that has the same eol-spec as
5139    PARENT.  Otherwise, return CODING_SYSTEM.  */
5140
5141 Lisp_Object
5142 coding_inherit_eol_type (coding_system, parent)
5143      Lisp_Object coding_system, parent;
5144 {
5145   Lisp_Object spec, eol_type;
5146
5147   if (NILP (coding_system))
5148     coding_system = Qraw_text;
5149   spec = CODING_SYSTEM_SPEC (coding_system);
5150   eol_type = AREF (spec, 2);
5151   if (VECTORP (eol_type)
5152       && ! NILP (parent))
5153     {
5154       Lisp_Object parent_spec;
5155       Lisp_Object parent_eol_type;
5156
5157       parent_spec
5158         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5159       parent_eol_type = AREF (parent_spec, 2);
5160       if (EQ (parent_eol_type, Qunix))
5161         coding_system = AREF (eol_type, 0);
5162       else if (EQ (parent_eol_type, Qdos))
5163         coding_system = AREF (eol_type, 1);
5164       else if (EQ (parent_eol_type, Qmac))
5165         coding_system = AREF (eol_type, 2);
5166     }
5167   return coding_system;
5168 }
5169
5170 /* Emacs has a mechanism to automatically detect a coding system if it
5171    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
5172    it's impossible to distinguish some coding systems accurately
5173    because they use the same range of codes.  So, at first, coding
5174    systems are categorized into the following categories:
5175
5176    o coding-category-emacs-mule
5177
5178         The category for a coding system which has the same code range
5179         as Emacs' internal format.  Assigned the coding-system (Lisp
5180         symbol) `emacs-mule' by default.
5181
5182    o coding-category-sjis
5183
5184         The category for a coding system which has the same code range
5185         as SJIS.  Assigned the coding-system (Lisp
5186         symbol) `japanese-shift-jis' by default.
5187
5188    o coding-category-iso-7
5189
5190         The category for a coding system which has the same code range
5191         as ISO2022 of 7-bit environment.  This doesn't use any locking
5192         shift and single shift functions.  This can encode/decode all
5193         charsets.  Assigned the coding-system (Lisp symbol)
5194         `iso-2022-7bit' by default.
5195
5196    o coding-category-iso-7-tight
5197
5198         Same as coding-category-iso-7 except that this can
5199         encode/decode only the specified charsets.
5200
5201    o coding-category-iso-8-1
5202
5203         The category for a coding system which has the same code range
5204         as ISO2022 of 8-bit environment and graphic plane 1 used only
5205         for DIMENSION1 charset.  This doesn't use any locking shift
5206         and single shift functions.  Assigned the coding-system (Lisp
5207         symbol) `iso-latin-1' by default.
5208
5209    o coding-category-iso-8-2
5210
5211         The category for a coding system which has the same code range
5212         as ISO2022 of 8-bit environment and graphic plane 1 used only
5213         for DIMENSION2 charset.  This doesn't use any locking shift
5214         and single shift functions.  Assigned the coding-system (Lisp
5215         symbol) `japanese-iso-8bit' by default.
5216
5217    o coding-category-iso-7-else
5218
5219         The category for a coding system which has the same code range
5220         as ISO2022 of 7-bit environment but uses locking shift or
5221         single shift functions.  Assigned the coding-system (Lisp
5222         symbol) `iso-2022-7bit-lock' by default.
5223
5224    o coding-category-iso-8-else
5225
5226         The category for a coding system which has the same code range
5227         as ISO2022 of 8-bit environment but uses locking shift or
5228         single shift functions.  Assigned the coding-system (Lisp
5229         symbol) `iso-2022-8bit-ss2' by default.
5230
5231    o coding-category-big5
5232
5233         The category for a coding system which has the same code range
5234         as BIG5.  Assigned the coding-system (Lisp symbol)
5235         `cn-big5' by default.
5236
5237    o coding-category-utf-8
5238
5239         The category for a coding system which has the same code range
5240         as UTF-8 (cf. RFC3629).  Assigned the coding-system (Lisp
5241         symbol) `utf-8' by default.
5242
5243    o coding-category-utf-16-be
5244
5245         The category for a coding system in which a text has an
5246         Unicode signature (cf. Unicode Standard) in the order of BIG
5247         endian at the head.  Assigned the coding-system (Lisp symbol)
5248         `utf-16-be' by default.
5249
5250    o coding-category-utf-16-le
5251
5252         The category for a coding system in which a text has an
5253         Unicode signature (cf. Unicode Standard) in the order of
5254         LITTLE endian at the head.  Assigned the coding-system (Lisp
5255         symbol) `utf-16-le' by default.
5256
5257    o coding-category-ccl
5258
5259         The category for a coding system of which encoder/decoder is
5260         written in CCL programs.  The default value is nil, i.e., no
5261         coding system is assigned.
5262
5263    o coding-category-binary
5264
5265         The category for a coding system not categorized in any of the
5266         above.  Assigned the coding-system (Lisp symbol)
5267         `no-conversion' by default.
5268
5269    Each of them is a Lisp symbol and the value is an actual
5270    `coding-system's (this is also a Lisp symbol) assigned by a user.
5271    What Emacs does actually is to detect a category of coding system.
5272    Then, it uses a `coding-system' assigned to it.  If Emacs can't
5273    decide only one possible category, it selects a category of the
5274    highest priority.  Priorities of categories are also specified by a
5275    user in a Lisp variable `coding-category-list'.
5276
5277 */
5278
5279 #define EOL_SEEN_NONE   0
5280 #define EOL_SEEN_LF     1
5281 #define EOL_SEEN_CR     2
5282 #define EOL_SEEN_CRLF   4
5283
5284 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5285    SOURCE is encoded.  If CATEGORY is one of
5286    coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5287    two-byte, else they are encoded by one-byte.
5288
5289    Return one of EOL_SEEN_XXX.  */
5290
5291 #define MAX_EOL_CHECK_COUNT 3
5292
5293 static int
5294 detect_eol (source, src_bytes, category)
5295      const unsigned char *source;
5296      EMACS_INT src_bytes;
5297      enum coding_category category;
5298 {
5299   const unsigned char *src = source, *src_end = src + src_bytes;
5300   unsigned char c;
5301   int total  = 0;
5302   int eol_seen = EOL_SEEN_NONE;
5303
5304   if ((1 << category) & CATEGORY_MASK_UTF_16)
5305     {
5306       int msb, lsb;
5307
5308       msb = category == (coding_category_utf_16_le
5309                          | coding_category_utf_16_le_nosig);
5310       lsb = 1 - msb;
5311
5312       while (src + 1 < src_end)
5313         {
5314           c = src[lsb];
5315           if (src[msb] == 0 && (c == '\n' || c == '\r'))
5316             {
5317               int this_eol;
5318
5319               if (c == '\n')
5320                 this_eol = EOL_SEEN_LF;
5321               else if (src + 3 >= src_end
5322                        || src[msb + 2] != 0
5323                        || src[lsb + 2] != '\n')
5324                 this_eol = EOL_SEEN_CR;
5325               else
5326                 this_eol = EOL_SEEN_CRLF;
5327
5328               if (eol_seen == EOL_SEEN_NONE)
5329                 /* This is the first end-of-line.  */
5330                 eol_seen = this_eol;
5331               else if (eol_seen != this_eol)
5332                 {
5333                   /* The found type is different from what found before.  */
5334                   eol_seen = EOL_SEEN_LF;
5335                   break;
5336                 }
5337               if (++total == MAX_EOL_CHECK_COUNT)
5338                 break;
5339             }
5340           src += 2;
5341         }
5342     }
5343   else
5344     {
5345       while (src < src_end)
5346         {
5347           c = *src++;
5348           if (c == '\n' || c == '\r')
5349             {
5350               int this_eol;
5351
5352               if (c == '\n')
5353                 this_eol = EOL_SEEN_LF;
5354               else if (src >= src_end || *src != '\n')
5355                 this_eol = EOL_SEEN_CR;
5356               else
5357                 this_eol = EOL_SEEN_CRLF, src++;
5358
5359               if (eol_seen == EOL_SEEN_NONE)
5360                 /* This is the first end-of-line.  */
5361                 eol_seen = this_eol;
5362               else if (eol_seen != this_eol)
5363                 {
5364                   /* The found type is different from what found before.  */
5365                   eol_seen = EOL_SEEN_LF;
5366                   break;
5367                 }
5368               if (++total == MAX_EOL_CHECK_COUNT)
5369                 break;
5370             }
5371         }
5372     }
5373   return eol_seen;
5374 }
5375
5376
5377 static Lisp_Object
5378 adjust_coding_eol_type (coding, eol_seen)
5379      struct coding_system *coding;
5380      int eol_seen;
5381 {
5382   Lisp_Object eol_type;
5383
5384   eol_type = CODING_ID_EOL_TYPE (coding->id);
5385   if (eol_seen & EOL_SEEN_LF)
5386     {
5387       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5388       eol_type = Qunix;
5389     }
5390   else if (eol_seen & EOL_SEEN_CRLF)
5391     {
5392       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5393       eol_type = Qdos;
5394     }
5395   else if (eol_seen & EOL_SEEN_CR)
5396     {
5397       coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5398       eol_type = Qmac;
5399     }
5400   return eol_type;
5401 }
5402
5403 /* Detect how a text specified in CODING is encoded.  If a coding
5404    system is detected, update fields of CODING by the detected coding
5405    system.  */
5406
5407 void
5408 detect_coding (coding)
5409      struct coding_system *coding;
5410 {
5411   const unsigned char *src, *src_end;
5412
5413   coding->consumed = coding->consumed_char = 0;
5414   coding->produced = coding->produced_char = 0;
5415   coding_set_source (coding);
5416
5417   src_end = coding->source + coding->src_bytes;
5418
5419   /* If we have not yet decided the text encoding type, detect it
5420      now.  */
5421   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5422     {
5423       int c, i;
5424
5425       for (i = 0, src = coding->source; src < src_end; i++, src++)
5426         {
5427           c = *src;
5428           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5429                                         || c == ISO_CODE_SI
5430                                         || c == ISO_CODE_SO)))
5431             break;
5432         }
5433       coding->head_ascii = src - (coding->source + coding->consumed);
5434
5435       if (coding->head_ascii < coding->src_bytes)
5436         {
5437           struct coding_detection_info detect_info;
5438           enum coding_category category;
5439           struct coding_system *this;
5440
5441           detect_info.checked = detect_info.found = detect_info.rejected = 0;
5442           for (i = 0; i < coding_category_raw_text; i++)
5443             {
5444               category = coding_priorities[i];
5445               this = coding_categories + category;
5446               if (this->id < 0)
5447                 {
5448                   /* No coding system of this category is defined.  */
5449                   detect_info.rejected |= (1 << category);
5450                 }
5451               else if (category >= coding_category_raw_text)
5452                 continue;
5453               else if (detect_info.checked & (1 << category))
5454                 {
5455                   if (detect_info.found & (1 << category))
5456                     break;
5457                 }
5458               else if ((*(this->detector)) (coding, &detect_info)
5459                        && detect_info.found & (1 << category))
5460                 {
5461                   if (category == coding_category_utf_16_auto)
5462                     {
5463                       if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5464                         category = coding_category_utf_16_le;
5465                       else
5466                         category = coding_category_utf_16_be;
5467                     }
5468                   break;
5469                 }
5470             }
5471           if (i < coding_category_raw_text)
5472             setup_coding_system (CODING_ID_NAME (this->id), coding);
5473           else if (detect_info.rejected == CATEGORY_MASK_ANY)
5474             setup_coding_system (Qraw_text, coding);
5475           else if (detect_info.rejected)
5476             for (i = 0; i < coding_category_raw_text; i++)
5477               if (! (detect_info.rejected & (1 << coding_priorities[i])))
5478                 {
5479                   this = coding_categories + coding_priorities[i];
5480                   setup_coding_system (CODING_ID_NAME (this->id), coding);
5481                   break;
5482                 }
5483         }
5484     }
5485   else if (XINT (CODING_ATTR_CATEGORY (CODING_ID_ATTRS (coding->id)))
5486            == coding_category_utf_16_auto)
5487     {
5488       Lisp_Object coding_systems;
5489       struct coding_detection_info detect_info;
5490
5491       coding_systems
5492         = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5493       detect_info.found = detect_info.rejected = 0;
5494       if (CONSP (coding_systems)
5495           && detect_coding_utf_16 (coding, &detect_info))
5496         {
5497           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5498             setup_coding_system (XCAR (coding_systems), coding);
5499           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
5500             setup_coding_system (XCDR (coding_systems), coding);
5501         }
5502     }
5503 }
5504
5505
5506 static void
5507 decode_eol (coding)
5508      struct coding_system *coding;
5509 {
5510   Lisp_Object eol_type;
5511   unsigned char *p, *pbeg, *pend;
5512
5513   eol_type = CODING_ID_EOL_TYPE (coding->id);
5514   if (EQ (eol_type, Qunix))
5515     return;
5516
5517   if (NILP (coding->dst_object))
5518     pbeg = coding->destination;
5519   else
5520     pbeg = BYTE_POS_ADDR (coding->dst_pos_byte);
5521   pend = pbeg + coding->produced;
5522
5523   if (VECTORP (eol_type))
5524     {
5525       int eol_seen = EOL_SEEN_NONE;
5526
5527       for (p = pbeg; p < pend; p++)
5528         {
5529           if (*p == '\n')
5530             eol_seen |= EOL_SEEN_LF;
5531           else if (*p == '\r')
5532             {
5533               if (p + 1 < pend && *(p + 1) == '\n')
5534                 {
5535                   eol_seen |= EOL_SEEN_CRLF;
5536                   p++;
5537                 }
5538               else
5539                 eol_seen |= EOL_SEEN_CR;
5540             }
5541         }
5542       if (eol_seen != EOL_SEEN_NONE
5543           && eol_seen != EOL_SEEN_LF
5544           && eol_seen != EOL_SEEN_CRLF
5545           && eol_seen != EOL_SEEN_CR)
5546         eol_seen = EOL_SEEN_LF;
5547       if (eol_seen != EOL_SEEN_NONE)
5548         eol_type = adjust_coding_eol_type (coding, eol_seen);
5549     }
5550
5551   if (EQ (eol_type, Qmac))
5552     {
5553       for (p = pbeg; p < pend; p++)
5554         if (*p == '\r')
5555           *p = '\n';
5556     }
5557   else if (EQ (eol_type, Qdos))
5558     {
5559       int n = 0;
5560
5561       if (NILP (coding->dst_object))
5562         {
5563           for (p = pend - 2; p >= pbeg; p--)
5564             if (*p == '\r')
5565               {
5566                 safe_bcopy ((char *) (p + 1), (char *) p, pend-- - p - 1);
5567                 n++;
5568               }
5569         }
5570       else
5571         {
5572           for (p = pend - 2; p >= pbeg; p--)
5573             if (*p == '\r')
5574               {
5575                 int pos_byte = coding->dst_pos_byte + (p - pbeg);
5576                 int pos = BYTE_TO_CHAR (pos_byte);
5577
5578                 del_range_2 (pos, pos_byte, pos + 1, pos_byte + 1, 0);
5579                 n++;
5580               }
5581         }
5582       coding->produced -= n;
5583       coding->produced_char -= n;
5584     }
5585 }
5586
5587
5588 /* Return a translation table (or list of them) from coding system
5589    attribute vector ATTRS for encoding (ENCODEP is nonzero) or
5590    decoding (ENCODEP is zero). */
5591
5592 static Lisp_Object
5593 get_translation_table (attrs, encodep, max_lookup)
5594      Lisp_Object attrs;
5595      int encodep, *max_lookup;
5596 {
5597   Lisp_Object standard, translation_table;
5598   Lisp_Object val;
5599
5600   if (encodep)
5601     translation_table = CODING_ATTR_ENCODE_TBL (attrs),
5602       standard = Vstandard_translation_table_for_encode;
5603   else
5604     translation_table = CODING_ATTR_DECODE_TBL (attrs),
5605       standard = Vstandard_translation_table_for_decode;
5606   if (NILP (translation_table))
5607     translation_table = standard;
5608   else
5609     {
5610       if (SYMBOLP (translation_table))
5611         translation_table = Fget (translation_table, Qtranslation_table);
5612       else if (CONSP (translation_table))
5613         {
5614           translation_table = Fcopy_sequence (translation_table);
5615           for (val = translation_table; CONSP (val); val = XCDR (val))
5616             if (SYMBOLP (XCAR (val)))
5617               XSETCAR (val, Fget (XCAR (val), Qtranslation_table));
5618         }
5619       if (CHAR_TABLE_P (standard))
5620         {
5621           if (CONSP (translation_table))
5622             translation_table = nconc2 (translation_table,
5623                                         Fcons (standard, Qnil));
5624           else
5625             translation_table = Fcons (translation_table,
5626                                        Fcons (standard, Qnil));
5627         }
5628     }
5629
5630   if (max_lookup)
5631     {
5632       *max_lookup = 1;
5633       if (CHAR_TABLE_P (translation_table)
5634           && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (translation_table)) > 1)
5635         {
5636           val = XCHAR_TABLE (translation_table)->extras[1];
5637           if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5638             *max_lookup = XFASTINT (val);
5639         }
5640       else if (CONSP (translation_table))
5641         {
5642           Lisp_Object tail, val;
5643
5644           for (tail = translation_table; CONSP (tail); tail = XCDR (tail))
5645             if (CHAR_TABLE_P (XCAR (tail))
5646                 && CHAR_TABLE_EXTRA_SLOTS (XCHAR_TABLE (XCAR (tail))) > 1)
5647               {
5648                 val = XCHAR_TABLE (XCAR (tail))->extras[1];
5649                 if (NATNUMP (val) && *max_lookup < XFASTINT (val))
5650                   *max_lookup = XFASTINT (val);
5651               }
5652         }
5653     }
5654   return translation_table;
5655 }
5656
/* Look up character C in TABLE, which is a char table or a list of
   char tables (anything else leaves C alone and sets TRANS to nil).

   When a table maps C to a character, C is replaced by it and TRANS
   is set to nil; with a list, the lookup then goes on in the next
   table using the new C.  When a table maps C to any other non-nil
   value, TRANS is set to that value (and, with a list, the lookup
   stops there).

   C and TRANS must be lvalues; all arguments are evaluated more than
   once.  */

#define LOOKUP_TRANSLATION_TABLE(table, c, trans)                       \
  do {                                                                  \
    trans = Qnil;                                                       \
    if (CONSP (table))                                                  \
      {                                                                 \
        Lisp_Object tail;                                               \
                                                                        \
        for (tail = table; CONSP (tail); tail = XCDR (tail))            \
          {                                                             \
            if (! CHAR_TABLE_P (XCAR (tail)))                           \
              continue;                                                 \
            trans = CHAR_TABLE_REF (XCAR (tail), c);                    \
            if (CHARACTERP (trans))                                     \
              {                                                         \
                c = XFASTINT (trans);                                   \
                trans = Qnil;                                           \
              }                                                         \
            else if (! NILP (trans))                                    \
              break;                                                    \
          }                                                             \
      }                                                                 \
    else if (CHAR_TABLE_P (table))                                      \
      {                                                                 \
        trans = CHAR_TABLE_REF (table, c);                              \
        if (CHARACTERP (trans))                                         \
          {                                                             \
            c = XFASTINT (trans);                                       \
            trans = Qnil;                                               \
          }                                                             \
      }                                                                 \
  } while (0)
5681
5682
5683 static Lisp_Object
5684 get_translation (val, buf, buf_end, last_block, from_nchars, to_nchars)
5685      Lisp_Object val;
5686      int *buf, *buf_end;
5687      int last_block;
5688      int *from_nchars, *to_nchars;
5689 {
5690   /* VAL is TO or (([FROM-CHAR ...] .  TO) ...) where TO is TO-CHAR or
5691      [TO-CHAR ...].  */
5692   if (CONSP (val))
5693     {
5694       Lisp_Object from, tail;
5695       int i, len;
5696
5697       for (tail = val; CONSP (tail); tail = XCDR (tail))
5698         {
5699           val = XCAR (tail);
5700           from = XCAR (val);
5701           len = ASIZE (from);
5702           for (i = 0; i < len; i++)
5703             {
5704               if (buf + i == buf_end)
5705                 {
5706                   if (! last_block)
5707                     return Qt;
5708                   break;
5709                 }
5710               if (XINT (AREF (from, i)) != buf[i])
5711                 break;
5712             }
5713           if (i == len)
5714             {
5715               val = XCDR (val);
5716               *from_nchars = len;
5717               break;
5718             }
5719         }
5720       if (! CONSP (tail))
5721         return Qnil;
5722     }
5723   if (VECTORP (val))
5724     *buf = XINT (AREF (val, 0)), *to_nchars = ASIZE (val);
5725   else
5726     *buf = XINT (val);
5727   return val;
5728 }
5729
5730
5731 static int
5732 produce_chars (coding, translation_table, last_block)
5733      struct coding_system *coding;
5734      Lisp_Object translation_table;
5735      int last_block;
5736 {
5737   unsigned char *dst = coding->destination + coding->produced;
5738   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5739   int produced;
5740   int produced_chars = 0;
5741   int carryover = 0;
5742
5743   if (! coding->chars_at_source)
5744     {
5745       /* Characters are in coding->charbuf.  */
5746       int *buf = coding->charbuf;
5747       int *buf_end = buf + coding->charbuf_used;
5748
5749       if (BUFFERP (coding->src_object)
5750           && EQ (coding->src_object, coding->dst_object))
5751         dst_end = ((unsigned char *) coding->source) + coding->consumed;
5752
5753       while (buf < buf_end)
5754         {
5755           int c = *buf, i;
5756
5757           if (c >= 0)
5758             {
5759               int from_nchars = 1, to_nchars = 1;
5760               Lisp_Object trans = Qnil;
5761
5762               LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
5763               if (! NILP (trans))
5764                 {
5765                   trans = get_translation (trans, buf, buf_end, last_block,
5766                                            &from_nchars, &to_nchars);
5767                   if (EQ (trans, Qt))
5768                     break;
5769                   c = *buf;
5770                 }
5771
5772               if (dst + MAX_MULTIBYTE_LENGTH * to_nchars > dst_end)
5773                 {
5774                   dst = alloc_destination (coding,
5775                                            buf_end - buf
5776                                            + MAX_MULTIBYTE_LENGTH * to_nchars,
5777                                            dst);
5778                   dst_end = coding->destination + coding->dst_bytes;
5779                 }
5780
5781               for (i = 0; i < to_nchars; i++)
5782                 {
5783                   if (i > 0)
5784                     c = XINT (AREF (trans, i));
5785                   if (coding->dst_multibyte
5786                       || ! CHAR_BYTE8_P (c))
5787                     CHAR_STRING_ADVANCE (c, dst);
5788                   else
5789                     *dst++ = CHAR_TO_BYTE8 (c);
5790                 }
5791               produced_chars += to_nchars;
5792               *buf++ = to_nchars;
5793               while (--from_nchars > 0)
5794                 *buf++ = 0;
5795             }
5796           else
5797             /* This is an annotation datum.  (-C) is the length.  */
5798             buf += -c;
5799         }
5800       carryover = buf_end - buf;
5801     }
5802   else
5803     {
5804       const unsigned char *src = coding->source;
5805       const unsigned char *src_end = src + coding->src_bytes;
5806       Lisp_Object eol_type;
5807
5808       eol_type = CODING_ID_EOL_TYPE (coding->id);
5809
5810       if (coding->src_multibyte != coding->dst_multibyte)
5811         {
5812           if (coding->src_multibyte)
5813             {
5814               int multibytep = 1;
5815               int consumed_chars;
5816
5817               while (1)
5818                 {
5819                   const unsigned char *src_base = src;
5820                   int c;
5821
5822                   ONE_MORE_BYTE (c);
5823                   if (c == '\r')
5824                     {
5825                       if (EQ (eol_type, Qdos))
5826                         {
5827                           if (src == src_end)
5828                             {
5829                               record_conversion_result
5830                                 (coding, CODING_RESULT_INSUFFICIENT_SRC);
5831                               goto no_more_source;
5832                             }
5833                           if (*src == '\n')
5834                             c = *src++;
5835                         }
5836                       else if (EQ (eol_type, Qmac))
5837                         c = '\n';
5838                     }
5839                   if (dst == dst_end)
5840                     {
5841                       coding->consumed = src - coding->source;
5842
5843                     if (EQ (coding->src_object, coding->dst_object))
5844                       dst_end = (unsigned char *) src;
5845                     if (dst == dst_end)
5846                       {
5847                         dst = alloc_destination (coding, src_end - src + 1,
5848                                                  dst);
5849                         dst_end = coding->destination + coding->dst_bytes;
5850                         coding_set_source (coding);
5851                         src = coding->source + coding->consumed;
5852                         src_end = coding->source + coding->src_bytes;
5853                       }
5854                     }
5855                   *dst++ = c;
5856                   produced_chars++;
5857                 }
5858             no_more_source:
5859               ;
5860             }
5861           else
5862             while (src < src_end)
5863               {
5864                 int multibytep = 1;
5865                 int c = *src++;
5866
5867                 if (c == '\r')
5868                   {
5869                     if (EQ (eol_type, Qdos))
5870                       {
5871                         if (src < src_end
5872                             && *src == '\n')
5873                           c = *src++;
5874                       }
5875                     else if (EQ (eol_type, Qmac))
5876                       c = '\n';
5877                   }
5878                 if (dst >= dst_end - 1)
5879                   {
5880                     coding->consumed = src - coding->source;
5881
5882                     if (EQ (coding->src_object, coding->dst_object))
5883                       dst_end = (unsigned char *) src;
5884                     if (dst >= dst_end - 1)
5885                       {
5886                         dst = alloc_destination (coding, src_end - src + 2,
5887                                                  dst);
5888                         dst_end = coding->destination + coding->dst_bytes;
5889                         coding_set_source (coding);
5890                         src = coding->source + coding->consumed;
5891                         src_end = coding->source + coding->src_bytes;
5892                       }
5893                   }
5894                 EMIT_ONE_BYTE (c);
5895               }
5896         }
5897       else
5898         {
5899           if (!EQ (coding->src_object, coding->dst_object))
5900             {
5901               int require = coding->src_bytes - coding->dst_bytes;
5902
5903               if (require > 0)
5904                 {
5905                   EMACS_INT offset = src - coding->source;
5906
5907                   dst = alloc_destination (coding, require, dst);
5908                   coding_set_source (coding);
5909                   src = coding->source + offset;
5910                   src_end = coding->source + coding->src_bytes;
5911                 }
5912             }
5913           produced_chars = coding->src_chars;
5914           while (src < src_end)
5915             {
5916               int c = *src++;
5917
5918               if (c == '\r')
5919                 {
5920                   if (EQ (eol_type, Qdos))
5921                     {
5922                       if (src < src_end
5923                           && *src == '\n')
5924                         c = *src++;
5925                       produced_chars--;
5926                     }
5927                   else if (EQ (eol_type, Qmac))
5928                     c = '\n';
5929                 }
5930               *dst++ = c;
5931             }
5932         }
5933       coding->consumed = coding->src_bytes;
5934       coding->consumed_char = coding->src_chars;
5935     }
5936
5937   produced = dst - (coding->destination + coding->produced);
5938   if (BUFFERP (coding->dst_object))
5939     insert_from_gap (produced_chars, produced);
5940   coding->produced += produced;
5941   coding->produced_char += produced_chars;
5942   return carryover;
5943 }
5944
5945 /* Compose text in CODING->object according to the annotation data at
5946    CHARBUF.  CHARBUF is an array:
5947      [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5948  */
5949
5950 static INLINE void
5951 produce_composition (coding, charbuf, pos)
5952      struct coding_system *coding;
5953      int *charbuf;
5954      EMACS_INT pos;
5955 {
5956   int len;
5957   EMACS_INT to;
5958   enum composition_method method;
5959   Lisp_Object components;
5960
5961   len = -charbuf[0];
5962   to = pos + charbuf[2];
5963   if (to <= pos)
5964     return;
5965   method = (enum composition_method) (charbuf[3]);
5966
5967   if (method == COMPOSITION_RELATIVE)
5968     components = Qnil;
5969   else if (method >= COMPOSITION_WITH_RULE
5970            && method <= COMPOSITION_WITH_RULE_ALTCHARS)
5971     {
5972       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5973       int i;
5974
5975       len -= 4;
5976       charbuf += 4;
5977       for (i = 0; i < len; i++)
5978         {
5979           args[i] = make_number (charbuf[i]);
5980           if (args[i] < 0)
5981             return;
5982         }
5983       components = (method == COMPOSITION_WITH_ALTCHARS
5984                     ? Fstring (len, args) : Fvector (len, args));
5985     }
5986   else
5987     return;
5988   compose_text (pos, to, components, Qnil, coding->dst_object);
5989 }
5990
5991
5992 /* Put `charset' property on text in CODING->object according to
5993    the annotation data at CHARBUF.  CHARBUF is an array:
5994      [ -LENGTH ANNOTATION_MASK NCHARS CHARSET-ID ]
5995  */
5996
5997 static INLINE void
5998 produce_charset (coding, charbuf, pos)
5999      struct coding_system *coding;
6000      int *charbuf;
6001      EMACS_INT pos;
6002 {
6003   EMACS_INT from = pos - charbuf[2];
6004   struct charset *charset = CHARSET_FROM_ID (charbuf[3]);
6005
6006   Fput_text_property (make_number (from), make_number (pos),
6007                       Qcharset, CHARSET_NAME (charset),
6008                       coding->dst_object);
6009 }
6010
6011
/* Number of elements first tried for the conversion work area.  */
#define CHARBUF_SIZE 0x4000

/* Allocate CODING->charbuf with alloca and record its size in
   CODING->charbuf_size.  CHARBUF_SIZE elements are tried first, and
   the size is halved for as long as it stays above 1024 and alloca
   yields NULL.

   If no memory could be obtained, CODING_RESULT_INSUFFICIENT_MEM is
   recorded and the macro executes `return coding->result', so it can
   only be used directly in a function that returns such a value.  As
   the memory comes from alloca, it must not be used after that
   function has returned.  */

#define ALLOC_CONVERSION_WORK_AREA(coding)                              \
  do {                                                                  \
    int size;                                                           \
                                                                        \
    coding->charbuf = NULL;                                             \
    for (size = CHARBUF_SIZE; size > 1024; size >>= 1)                  \
      {                                                                 \
        coding->charbuf = (int *) alloca (sizeof (int) * size);         \
        if (coding->charbuf)                                            \
          break;                                                        \
      }                                                                 \
    if (! coding->charbuf)                                              \
      {                                                                 \
        record_conversion_result (coding, CODING_RESULT_INSUFFICIENT_MEM); \
        return coding->result;                                          \
      }                                                                 \
    coding->charbuf_size = size;                                        \
  } while (0)
6033
6034
6035 static void
6036 produce_annotation (coding, pos)
6037      struct coding_system *coding;
6038      EMACS_INT pos;
6039 {
6040   int *charbuf = coding->charbuf;
6041   int *charbuf_end = charbuf + coding->charbuf_used;
6042
6043   if (NILP (coding->dst_object))
6044     return;
6045
6046   while (charbuf < charbuf_end)
6047     {
6048       if (*charbuf >= 0)
6049         pos += *charbuf++;
6050       else
6051         {
6052           int len = -*charbuf;
6053           switch (charbuf[1])
6054             {
6055             case CODING_ANNOTATE_COMPOSITION_MASK:
6056               produce_composition (coding, charbuf, pos);
6057               break;
6058             case CODING_ANNOTATE_CHARSET_MASK:
6059               produce_charset (coding, charbuf, pos);
6060               break;
6061             default:
6062               abort ();
6063             }
6064           charbuf += len;
6065         }
6066     }
6067 }
6068
6069 /* Decode the data at CODING->src_object into CODING->dst_object.
6070    CODING->src_object is a buffer, a string, or nil.
6071    CODING->dst_object is a buffer.
6072
6073    If CODING->src_object is a buffer, it must be the current buffer.
6074    In this case, if CODING->src_pos is positive, it is a position of
6075    the source text in the buffer, otherwise, the source text is in the
6076    gap area of the buffer, and CODING->src_pos specifies the offset of
6077    the text from GPT (which must be the same as PT).  If this is the
6078    same buffer as CODING->dst_object, CODING->src_pos must be
6079    negative.
6080
6081    If CODING->src_object is a string, CODING->src_pos in an index to
6082    that string.
6083
6084    If CODING->src_object is nil, CODING->source must already point to
6085    the non-relocatable memory area.  In this case, CODING->src_pos is
6086    an offset from CODING->source.
6087
6088    The decoded data is inserted at the current point of the buffer
6089    CODING->dst_object.
6090 */
6091
6092 static int
6093 decode_coding (coding)
6094      struct coding_system *coding;
6095 {
6096   Lisp_Object attrs;
6097   Lisp_Object undo_list;
6098   Lisp_Object translation_table;
6099   int carryover;
6100   int i;
6101
6102   if (BUFFERP (coding->src_object)
6103       && coding->src_pos > 0
6104       && coding->src_pos < GPT
6105       && coding->src_pos + coding->src_chars > GPT)
6106     move_gap_both (coding->src_pos, coding->src_pos_byte);
6107
6108   undo_list = Qt;
6109   if (BUFFERP (coding->dst_object))
6110     {
6111       if (current_buffer != XBUFFER (coding->dst_object))
6112         set_buffer_internal (XBUFFER (coding->dst_object));
6113       if (GPT != PT)
6114         move_gap_both (PT, PT_BYTE);
6115       undo_list = current_buffer->undo_list;
6116       current_buffer->undo_list = Qt;
6117     }
6118
6119   coding->consumed = coding->consumed_char = 0;
6120   coding->produced = coding->produced_char = 0;
6121   coding->chars_at_source = 0;
6122   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6123   coding->errors = 0;
6124
6125   ALLOC_CONVERSION_WORK_AREA (coding);
6126
6127   attrs = CODING_ID_ATTRS (coding->id);
6128   translation_table = get_translation_table (attrs, 0, NULL);
6129
6130   carryover = 0;
6131   do
6132     {
6133       EMACS_INT pos = coding->dst_pos + coding->produced_char;
6134
6135       coding_set_source (coding);
6136       coding->annotated = 0;
6137       coding->charbuf_used = carryover;
6138       (*(coding->decoder)) (coding);
6139       coding_set_destination (coding);
6140       carryover = produce_chars (coding, translation_table, 0);
6141       if (coding->annotated)
6142         produce_annotation (coding, pos);
6143       for (i = 0; i < carryover; i++)
6144         coding->charbuf[i]
6145           = coding->charbuf[coding->charbuf_used - carryover + i];
6146     }
6147   while (coding->consumed < coding->src_bytes
6148          && ! coding->result);
6149
6150   if (carryover > 0)
6151     {
6152       coding_set_destination (coding);
6153       coding->charbuf_used = carryover;
6154       produce_chars (coding, translation_table, 1);
6155     }
6156
6157   coding->carryover_bytes = 0;
6158   if (coding->consumed < coding->src_bytes)
6159     {
6160       int nbytes = coding->src_bytes - coding->consumed;
6161       const unsigned char *src;
6162
6163       coding_set_source (coding);
6164       coding_set_destination (coding);
6165       src = coding->source + coding->consumed;
6166
6167       if (coding->mode & CODING_MODE_LAST_BLOCK)
6168         {
6169           /* Flush out unprocessed data as binary chars.  We are sure
6170              that the number of data is less than the size of
6171              coding->charbuf.  */
6172           coding->charbuf_used = 0;
6173           while (nbytes-- > 0)
6174             {
6175               int c = *src++;
6176
6177               coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
6178             }
6179           produce_chars (coding, Qnil, 1);
6180         }
6181       else
6182         {
6183           /* Record unprocessed bytes in coding->carryover.  We are
6184              sure that the number of data is less than the size of
6185              coding->carryover.  */
6186           unsigned char *p = coding->carryover;
6187
6188           coding->carryover_bytes = nbytes;
6189           while (nbytes-- > 0)
6190             *p++ = *src++;
6191         }
6192       coding->consumed = coding->src_bytes;
6193     }
6194
6195   if (BUFFERP (coding->dst_object))
6196     {
6197       current_buffer->undo_list = undo_list;
6198       record_insert (coding->dst_pos, coding->produced_char);
6199     }
6200   if (! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
6201     decode_eol (coding);
6202   return coding->result;
6203 }
6204
6205
6206 /* Extract an annotation datum from a composition starting at POS and
6207    ending before LIMIT of CODING->src_object (buffer or string), store
6208    the data in BUF, set *STOP to a starting position of the next
6209    composition (if any) or to LIMIT, and return the address of the
6210    next element of BUF.
6211
6212    If such an annotation is not found, set *STOP to a starting
6213    position of a composition after POS (if any) or to LIMIT, and
6214    return BUF.  */
6215
6216 static INLINE int *
6217 handle_composition_annotation (pos, limit, coding, buf, stop)
6218      EMACS_INT pos, limit;
6219      struct coding_system *coding;
6220      int *buf;
6221      EMACS_INT *stop;
6222 {
6223   EMACS_INT start, end;
6224   Lisp_Object prop;
6225
6226   if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
6227       || end > limit)
6228     *stop = limit;
6229   else if (start > pos)
6230     *stop = start;
6231   else
6232     {
6233       if (start == pos)
6234         {
6235           /* We found a composition.  Store the corresponding
6236              annotation data in BUF.  */
6237           int *head = buf;
6238           enum composition_method method = COMPOSITION_METHOD (prop);
6239           int nchars = COMPOSITION_LENGTH (prop);
6240
6241           ADD_COMPOSITION_DATA (buf, nchars, method);
6242           if (method != COMPOSITION_RELATIVE)
6243             {
6244               Lisp_Object components;
6245               int len, i, i_byte;
6246
6247               components = COMPOSITION_COMPONENTS (prop);
6248               if (VECTORP (components))
6249                 {
6250                   len = XVECTOR (components)->size;
6251                   for (i = 0; i < len; i++)
6252                     *buf++ = XINT (AREF (components, i));
6253                 }
6254               else if (STRINGP (components))
6255                 {
6256                   len = SCHARS (components);
6257                   i = i_byte = 0;
6258                   while (i < len)
6259                     {
6260                       FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
6261                       buf++;
6262                     }
6263                 }
6264               else if (INTEGERP (components))
6265                 {
6266                   len = 1;
6267                   *buf++ = XINT (components);
6268                 }
6269               else if (CONSP (components))
6270                 {
6271                   for (len = 0; CONSP (components);
6272                        len++, components = XCDR (components))
6273                     *buf++ = XINT (XCAR (components));
6274                 }
6275               else
6276                 abort ();
6277               *head -= len;
6278             }
6279         }
6280
6281       if (find_composition (end, limit, &start, &end, &prop,
6282                             coding->src_object)
6283           && end <= limit)
6284         *stop = start;
6285       else
6286         *stop = limit;
6287     }
6288   return buf;
6289 }
6290
6291
6292 /* Record the value of the text property `charset' at POS of
6293    CODING->src_object (a buffer or a string) as an annotation in BUF.
6294    When the property value satisfies CHARSETP, the charset's ID is
6295    recorded; for nil or any other value, -1 is recorded instead.
6296
6297    Set *STOP to the position where the `charset' property next
6298    changes, looking no further than LIMIT.  Return BUF as updated by
6299    ADD_CHARSET_DATA.  */
6300
6301 static INLINE int *
6302 handle_charset_annotation (pos, limit, coding, buf, stop)
6303      EMACS_INT pos, limit;
6304      struct coding_system *coding;
6305      int *buf;
6306      EMACS_INT *stop;
6307 {
6308   Lisp_Object here = make_number (pos);
6309   Lisp_Object prop, change;
6310   int charset_id = -1;
6311
6312   prop = Fget_text_property (here, Qcharset, coding->src_object);
6313   if (! NILP (prop) && CHARSETP (prop))
6314     charset_id = XINT (CHARSET_SYMBOL_ID (prop));
6315   ADD_CHARSET_DATA (buf, 0, charset_id);
6316
6317   change = Fnext_single_property_change (here, Qcharset,
6318                                          coding->src_object,
6319                                          make_number (limit));
6320   *stop = XINT (change);
6321   return buf;
6322 }
6323 /* Fill CODING->charbuf with characters read from CODING->source,
6324    applying eol conversion and TRANSLATION_TABLE (if non-nil).  */
6325 static void
6326 consume_chars (coding, translation_table, max_lookup)
6327      struct coding_system *coding;
6328      Lisp_Object translation_table;
6329      int max_lookup;
6330 {
6331   int *buf = coding->charbuf;
6332   int *buf_end = coding->charbuf + coding->charbuf_size;
6333   const unsigned char *src = coding->source + coding->consumed;
6334   const unsigned char *src_end = coding->source + coding->src_bytes;
6335   EMACS_INT pos = coding->src_pos + coding->consumed_char;
6336   EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6337   int multibytep = coding->src_multibyte;
6338   Lisp_Object eol_type;
6339   int c;
6340   EMACS_INT stop, stop_composition, stop_charset;
6341   int *lookup_buf = NULL;
6342   /* MAX_LOOKUP ints of scratch space, needed only for translation.  */
6343   if (! NILP (translation_table))
6344     lookup_buf = alloca (sizeof (int) * max_lookup);
6345   /* A vector eol-type is handled the same way as Qunix.  */
6346   eol_type = CODING_ID_EOL_TYPE (coding->id);
6347   if (VECTORP (eol_type))
6348     eol_type = Qunix;
6349   /* Note: composition handling is not yet implemented; clearing the
6350      flag makes the CODING_ANNOTATE_COMPOSITION_MASK test below fail.  */
6351   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6352   /* STOP is the next position at which an annotation must be handled.  */
6353   if (NILP (coding->src_object))
6354     stop = stop_composition = stop_charset = end_pos;
6355   else
6356     {
6357       if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6358         stop = stop_composition = pos;
6359       else
6360         stop = stop_composition = end_pos;
6361       if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6362         stop = stop_charset = pos;
6363       else
6364         stop_charset = end_pos;
6365     }
6366
6367   /* Compensate for CRLF and conversion.  */
6368   buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6369   while (buf < buf_end)
6370     {
6371       Lisp_Object trans;
6372
6373       if (pos == stop)
6374         {
6375           if (pos == end_pos)
6376             break;
6377           if (pos == stop_composition)
6378             buf = handle_composition_annotation (pos, end_pos, coding,
6379                                                  buf, &stop_composition);
6380           if (pos == stop_charset)
6381             buf = handle_charset_annotation (pos, end_pos, coding,
6382                                              buf, &stop_charset);
6383           stop = (stop_composition < stop_charset
6384                   ? stop_composition : stop_charset);
6385         }
6386       /* Fetch the next source character into C, advancing SRC and POS.  */
6387       if (! multibytep)
6388         {
6389           EMACS_INT bytes;
6390
6391           if (coding->encoder == encode_coding_raw_text)
6392             c = *src++, pos++;
6393           else if ((bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6394             c = STRING_CHAR_ADVANCE (src), pos += bytes;
6395           else
6396             c = BYTE8_TO_CHAR (*src), src++, pos++;
6397         }
6398       else
6399         c = STRING_CHAR_ADVANCE (src), pos++;
6400       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6401         c = '\n';
6402       if (! EQ (eol_type, Qunix))
6403         {
6404           if (c == '\n')
6405             {
6406               if (EQ (eol_type, Qdos))
6407                 *buf++ = '\r';
6408               else
6409                 c = '\r';
6410             }
6411         }
6412       /* Look C up in TRANSLATION_TABLE; with no translation, store C as is.  */
6413       trans = Qnil;
6414       LOOKUP_TRANSLATION_TABLE (translation_table, c, trans);
6415       if (NILP (trans))
6416         *buf++ = c;
6417       else
6418         {
6419           int from_nchars = 1, to_nchars = 1;
6420           int *lookup_buf_end;
6421           const unsigned char *p = src;
6422           int i;
6423
6424           lookup_buf[0] = c;
6425           for (i = 1; i < max_lookup && p < src_end; i++)
6426             lookup_buf[i] = STRING_CHAR_ADVANCE (p);
6427           lookup_buf_end = lookup_buf + i;
6428           trans = get_translation (trans, lookup_buf, lookup_buf_end, 1,
6429                                    &from_nchars, &to_nchars);
6430           if (EQ (trans, Qt)
6431               || buf + to_nchars > buf_end)
6432             break;
6433           *buf++ = *lookup_buf;
6434           for (i = 1; i < to_nchars; i++)
6435             *buf++ = XINT (AREF (trans, i));
6436           for (i = 1; i < from_nchars; i++, pos++)
6437             src += MULTIBYTE_LENGTH_NO_CHECK (src);
6438         }
6439     }
6440   /* Record the consumed source bytes/chars and the number of chars buffered.  */
6441   coding->consumed = src - coding->source;
6442   coding->consumed_char = pos - coding->src_pos;
6443   coding->charbuf_used = buf - coding->charbuf;
6444   coding->chars_at_source = 0;
6445 }
6446
6447
6448 /* Encode the text of CODING->src_object and store the result in
6449    CODING->dst_object.  CODING->src_object is a buffer or a string;
6450    CODING->dst_object is a buffer or nil.
6451
6452    A buffer given as CODING->src_object must be the current buffer.
6453    A positive CODING->src_pos is then the buffer position of the
6454    source text; otherwise the source text lies in the gap area of
6455    the buffer and CODING->src_pos is its offset from GPT (which must
6456    equal PT).  When that buffer is also CODING->dst_object,
6457    CODING->src_pos must be negative and CODING should not have
6458    `pre-write-conversion'.
6459
6460    A string given as CODING->src_object likewise requires that
6461    CODING has no `pre-write-conversion'.
6462
6463    With a buffer as CODING->dst_object, the encoded data is inserted
6464    at the current point of that buffer.
6465
6466    With nil as CODING->dst_object, the encoded data is stored in the
6467    memory area pointed to by CODING->destination.  */
6468
6469 static int
6470 encode_coding (coding)
6471      struct coding_system *coding;
6472 {
6473   Lisp_Object attrs = CODING_ID_ATTRS (coding->id);
6474   Lisp_Object table = Qnil;
6475   int lookup_size = 0;
6476
6477   /* Raw text is passed through without any translation table.  */
6478   if (coding->encoder != encode_coding_raw_text)
6479     table = get_translation_table (attrs, 1, &lookup_size);
6480
6481   if (BUFFERP (coding->dst_object))
6482     {
6483       set_buffer_internal (XBUFFER (coding->dst_object));
6484       coding->dst_multibyte
6485         = ! NILP (current_buffer->enable_multibyte_characters);
6486     }
6487
6488   coding->consumed_char = 0;
6489   coding->consumed = 0;
6490   coding->produced_char = 0;
6491   coding->produced = 0;
6492   record_conversion_result (coding, CODING_RESULT_SUCCESS);
6493   coding->errors = 0;
6494
6495   ALLOC_CONVERSION_WORK_AREA (coding);
6496   do
6497     {
6498       coding_set_source (coding);
6499       consume_chars (coding, table, lookup_size);
6500       coding_set_destination (coding);
6501       (*coding->encoder) (coding);
6502     }
6503   while (coding->consumed_char < coding->src_chars);
6504
6505   if (BUFFERP (coding->dst_object))
6506     insert_from_gap (coding->produced_char, coding->produced);
6507   return coding->result;
6508 }
6509
6510
6511 /* Name (or base name) of the work buffer used for code conversion.  */
6512 static Lisp_Object Vcode_conversion_workbuf_name;
6513
6514 /* A working buffer used by the top level conversion.  Once it is
6515    created, it is never destroyed.  It has the name
6516    Vcode_conversion_workbuf_name.  The other working buffers are
6517    destroyed once they are no longer in use, and their names are
6518    modified versions of Vcode_conversion_workbuf_name.  */
6519 static Lisp_Object Vcode_conversion_reused_workbuf;
6520
6521 /* Nonzero iff Vcode_conversion_reused_workbuf is already in use.  */
6522 static int reused_workbuf_in_use;
6523
6524
6525 /* Return an erased work buffer for code conversion, with its
6526    undo_list set to Qt.  It is multibyte iff MULTIBYTE is nonzero.  */
6527
6528 static Lisp_Object
6529 make_conversion_work_buffer (multibyte)
6530      int multibyte;
6531 {
6532   Lisp_Object buffer_name, work;
6533   struct buffer *caller_buffer;
6534   int already_in_use = reused_workbuf_in_use++;
6535
6536   /* The first user gets the buffer named
6537      Vcode_conversion_workbuf_name; while that one is in use, a
6538      buffer with a freshly generated name is made instead.  */
6539   if (already_in_use)
6540     buffer_name = Fgenerate_new_buffer_name (Vcode_conversion_workbuf_name,
6541                                              Qnil);
6542   else
6543     buffer_name = Vcode_conversion_workbuf_name;
6544   work = Fget_buffer_create (buffer_name);
6545   if (! already_in_use && NILP (Vcode_conversion_reused_workbuf))
6546     Vcode_conversion_reused_workbuf = work;
6547   caller_buffer = current_buffer;
6548   set_buffer_internal (XBUFFER (work));
6549   Ferase_buffer ();
6550   current_buffer->undo_list = Qt;
6551   current_buffer->enable_multibyte_characters = multibyte ? Qt : Qnil;
6552   set_buffer_internal (caller_buffer);
6553   return work;
6554 }
6555 /* Unwind function recorded by code_conversion_save.  ARG is a cons
6556    (BUFFER . WORKBUF): release WORKBUF, then make BUFFER current.  */
6557 static Lisp_Object
6558 code_conversion_restore (arg)
6559      Lisp_Object arg;
6560 {
6561   Lisp_Object saved_buffer, work;
6562
6563   saved_buffer = XCAR (arg);
6564   work = XCDR (arg);
6565   /* The reused work buffer is only marked as free again; any other
6566      work buffer is killed if it is still live.  */
6567   if (! NILP (work) && EQ (work, Vcode_conversion_reused_workbuf))
6568     reused_workbuf_in_use = 0;
6569   else if (! NILP (work) && ! NILP (Fbuffer_live_p (work)))
6570     Fkill_buffer (work);
6571
6572   set_buffer_internal (XBUFFER (saved_buffer));
6573   return Qnil;
6574 }
6575 /* Record an unwind handler for a conversion; return the work buffer or nil.  */
6576 Lisp_Object
6577 code_conversion_save (with_work_buf, multibyte)
6578      int with_work_buf, multibyte;
6579 {
6580   Lisp_Object workbuf;
6581
6582   /* A work buffer is made only when WITH_WORK_BUF is nonzero.  */
6583   workbuf = with_work_buf ? make_conversion_work_buffer (multibyte) : Qnil;
6584   record_unwind_protect (code_conversion_restore,
6585                          Fcons (Fcurrent_buffer (), workbuf));
6586   return workbuf;
6587 }
6588 /* Decode CHARS chars (BYTES bytes) of the current buffer by CODING; return CODING->result.  */
6589 int
6590 decode_coding_gap (coding, chars, bytes)
6591      struct coding_system *coding;
6592      EMACS_INT chars, bytes;
6593 {
6594   int specpdl_count = specpdl_ptr - specpdl;
6595   Lisp_Object attrs, post_read;
6596
6597   code_conversion_save (0, 0);
6598
6599   /* Source and destination are both the current buffer.  */
6600   coding->src_object = Fcurrent_buffer ();
6601   coding->src_pos = -chars;
6602   coding->src_pos_byte = -bytes;
6603   coding->src_chars = chars;
6604   coding->src_bytes = bytes;
6605   coding->src_multibyte = (chars < bytes);
6606   coding->dst_object = coding->src_object;
6607   coding->dst_pos = PT;
6608   coding->dst_pos_byte = PT_BYTE;
6609   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6610   coding->mode |= CODING_MODE_LAST_BLOCK;
6611
6612   if (CODING_REQUIRE_DETECTION (coding))
6613     detect_coding (coding);
6614   decode_coding (coding);
6615
6616   attrs = CODING_ID_ATTRS (coding->id);
6617   post_read = CODING_ATTR_POST_READ (attrs);
6618   if (! NILP (post_read))
6619     {
6620       EMACS_INT old_z = Z, old_z_byte = Z_BYTE;
6621       Lisp_Object val;
6622
6623       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6624       val = call1 (post_read, make_number (coding->produced_char));
6625       CHECK_NATNUM (val);
6626       coding->produced_char += Z - old_z;
6627       coding->produced += Z_BYTE - old_z_byte;
6628     }
6629
6630   unbind_to (specpdl_count, Qnil);
6631   return coding->result;
6632 }
6633 /* Encode CHARS chars (BYTES bytes) of the current buffer by CODING; return CODING->result.  */
6634 int
6635 encode_coding_gap (coding, chars, bytes)
6636      struct coding_system *coding;
6637      EMACS_INT chars, bytes;
6638 {
6639   int specpdl_count = specpdl_ptr - specpdl;
6640   Lisp_Object this_buffer;
6641
6642   code_conversion_save (0, 0);
6643   this_buffer = Fcurrent_buffer ();
6644   coding->src_object = this_buffer;
6645   coding->dst_object = this_buffer;
6646   coding->src_pos = -chars;
6647   coding->src_pos_byte = -bytes;
6648   coding->src_chars = chars;
6649   coding->src_bytes = bytes;
6650   coding->src_multibyte = (chars < bytes);
6651   coding->dst_pos = PT;
6652   coding->dst_pos_byte = PT_BYTE;
6653
6654   encode_coding (coding);
6655   unbind_to (specpdl_count, Qnil);
6656   return coding->result;
6657 }
6658
6659
6660 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6661    SRC_OBJECT into DST_OBJECT by coding context CODING.
6662
6663    SRC_OBJECT is a buffer, a string, or Qnil.
6664
6665    If it is a buffer, the text is at point of the buffer.  FROM and TO
6666    are positions in the buffer.
6667
6668    If it is a string, the text is at the beginning of the string.
6669    FROM and TO are indices to the string.
6670
6671    If it is nil, the text is at coding->source.  FROM and TO are
6672    indices to coding->source.
6673
6674    DST_OBJECT is a buffer, Qt, or Qnil.
6675
6676    If it is a buffer, the decoded text is inserted at point of the
6677    buffer.  If the buffer is the same as SRC_OBJECT, the source text
6678    is deleted.
6679
6680    If it is Qt, a string is made from the decoded text, and
6681    set in CODING->dst_object.
6682
6683    If it is Qnil, the decoded text is stored at CODING->destination.
6684    The caller must allocate CODING->dst_bytes bytes at
6685    CODING->destination by xmalloc.  If the decoded text is longer than
6686    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6687  */
6688
6689 void
6690 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6691                       dst_object)
6692      struct coding_system *coding;
6693      Lisp_Object src_object;
6694      EMACS_INT from, from_byte, to, to_byte;
6695      Lisp_Object dst_object;
6696 {
6697   int count = specpdl_ptr - specpdl;
6698   unsigned char *destination;
6699   EMACS_INT dst_bytes;
6700   EMACS_INT chars = to - from;
6701   EMACS_INT bytes = to_byte - from_byte;
6702   Lisp_Object attrs;
6703   Lisp_Object buffer;
6704   int saved_pt = -1, saved_pt_byte;
6705
6706   buffer = Fcurrent_buffer ();
6707   /* Remember the caller's destination area; it is used again below.  */
6708   if (NILP (dst_object))
6709     {
6710       destination = coding->destination;
6711       dst_bytes = coding->dst_bytes;
6712     }
6713
6714   coding->src_object = src_object;
6715   coding->src_chars = chars;
6716   coding->src_bytes = bytes;
6717   coding->src_multibyte = chars < bytes;
6718   /* Locate the source text; nothing more is set when SRC_OBJECT is nil.  */
6719   if (STRINGP (src_object))
6720     {
6721       coding->src_pos = from;
6722       coding->src_pos_byte = from_byte;
6723     }
6724   else if (BUFFERP (src_object))
6725     {
6726       set_buffer_internal (XBUFFER (src_object));
6727       if (from != GPT)
6728         move_gap_both (from, from_byte);
6729       if (EQ (src_object, dst_object))
6730         {
6731           saved_pt = PT, saved_pt_byte = PT_BYTE;
6732           TEMP_SET_PT_BOTH (from, from_byte);
6733           del_range_both (from, from_byte, to, to_byte, 1);
6734           coding->src_pos = -chars;
6735           coding->src_pos_byte = -bytes;
6736         }
6737       else
6738         {
6739           coding->src_pos = from;
6740           coding->src_pos_byte = from_byte;
6741         }
6742     }
6743   /* ATTRS is fetched after detection (detect_coding presumably updates CODING->id).  */
6744   if (CODING_REQUIRE_DETECTION (coding))
6745     detect_coding (coding);
6746   attrs = CODING_ID_ATTRS (coding->id);
6747   /* A string result, or a post-read function with a memory destination, uses a work buffer.  */
6748   if (EQ (dst_object, Qt)
6749       || (! NILP (CODING_ATTR_POST_READ (attrs))
6750           && NILP (dst_object)))
6751     {
6752       coding->dst_object = code_conversion_save (1, 1);
6753       coding->dst_pos = BEG;
6754       coding->dst_pos_byte = BEG_BYTE;
6755       coding->dst_multibyte = 1;
6756     }
6757   else if (BUFFERP (dst_object))
6758     {
6759       code_conversion_save (0, 0);
6760       coding->dst_object = dst_object;
6761       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6762       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6763       coding->dst_multibyte
6764         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6765     }
6766   else
6767     {
6768       code_conversion_save (0, 0);
6769       coding->dst_object = Qnil;
6770       coding->dst_multibyte = 1;
6771     }
6772
6773   decode_coding (coding);
6774
6775   if (BUFFERP (coding->dst_object))
6776     set_buffer_internal (XBUFFER (coding->dst_object));
6777   /* Call the post-read function, then add the change of Z/Z_BYTE to the counts.  */
6778   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6779     {
6780       struct gcpro gcpro1, gcpro2;
6781       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6782       Lisp_Object val;
6783
6784       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6785       GCPRO2 (coding->src_object, coding->dst_object);
6786       val = call1 (CODING_ATTR_POST_READ (attrs),
6787                    make_number (coding->produced_char));
6788       UNGCPRO;
6789       CHECK_NATNUM (val);
6790       coding->produced_char += Z - prev_Z;
6791       coding->produced += Z_BYTE - prev_Z_BYTE;
6792     }
6793
6794   if (EQ (dst_object, Qt))
6795     {
6796       coding->dst_object = Fbuffer_string ();
6797     }
6798   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6799     {
6800       set_buffer_internal (XBUFFER (coding->dst_object));
6801       if (dst_bytes < coding->produced) /* NOTE(review): nothing is copied when the area is big enough -- confirm.  */
6802         {
6803           destination
6804             = (unsigned char *) xrealloc (destination, coding->produced);
6805           if (! destination)
6806             {
6807               record_conversion_result (coding,
6808                                         CODING_RESULT_INSUFFICIENT_DST);
6809               unbind_to (count, Qnil);
6810               return;
6811             }
6812           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6813             move_gap_both (BEGV, BEGV_BYTE);
6814           bcopy (BEGV_ADDR, destination, coding->produced);
6815           coding->destination = destination;
6816         }
6817     }
6818
6819   if (saved_pt >= 0)
6820     {
6821       /* This is the case of:
6822          (BUFFERP (src_object) && EQ (src_object, dst_object))
6823          As we have moved PT while replacing the original buffer
6824          contents, we must recover it now.  */
6825       set_buffer_internal (XBUFFER (src_object));
6826       if (saved_pt < from)
6827         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6828       else if (saved_pt < from + chars)
6829         TEMP_SET_PT_BOTH (from, from_byte);
6830       else if (! NILP (current_buffer->enable_multibyte_characters))
6831         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6832                           saved_pt_byte + (coding->produced - bytes));
6833       else
6834         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6835                           saved_pt_byte + (coding->produced - bytes));
6836     }
6837   /* Run the handler recorded by code_conversion_save above.  */
6838   unbind_to (count, coding->dst_object);
6839 }
6840 /* Encode the text from FROM/FROM_BYTE to TO/TO_BYTE of SRC_OBJECT (a buffer,
6841    a string, or nil) by CODING into DST_OBJECT (a buffer, Qt, or nil).  */
6842 void
6843 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6844                       dst_object)
6845      struct coding_system *coding;
6846      Lisp_Object src_object;
6847      EMACS_INT from, from_byte, to, to_byte;
6848      Lisp_Object dst_object;
6849 {
6850   int count = specpdl_ptr - specpdl;
6851   EMACS_INT chars = to - from;
6852   EMACS_INT bytes = to_byte - from_byte;
6853   Lisp_Object attrs;
6854   Lisp_Object buffer;
6855   int saved_pt = -1, saved_pt_byte;
6856
6857   buffer = Fcurrent_buffer ();
6858
6859   coding->src_object = src_object;
6860   coding->src_chars = chars;
6861   coding->src_bytes = bytes;
6862   coding->src_multibyte = chars < bytes;
6863
6864   attrs = CODING_ID_ATTRS (coding->id);
6865   /* A pre-write function runs on a copy of the source in a work buffer.  */
6866   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6867     {
6868       coding->src_object = code_conversion_save (1, coding->src_multibyte);
6869       set_buffer_internal (XBUFFER (coding->src_object));
6870       if (STRINGP (src_object))
6871         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6872       else if (BUFFERP (src_object))
6873         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6874       else
6875         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6876       /* Both objects may be nil, and EQ alone would take that for one buffer.  */
6877       if (BUFFERP (src_object) && EQ (src_object, dst_object))
6878         {
6879           set_buffer_internal (XBUFFER (src_object));
6880           saved_pt = PT, saved_pt_byte = PT_BYTE;
6881           del_range_both (from, from_byte, to, to_byte, 1);
6882           set_buffer_internal (XBUFFER (coding->src_object));
6883         }
6884
6885       call2 (CODING_ATTR_PRE_WRITE (attrs),
6886              make_number (BEG), make_number (Z));
6887       coding->src_object = Fcurrent_buffer ();
6888       if (BEG != GPT)
6889         move_gap_both (BEG, BEG_BYTE);
6890       coding->src_chars = Z - BEG;
6891       coding->src_bytes = Z_BYTE - BEG_BYTE;
6892       coding->src_pos = BEG;
6893       coding->src_pos_byte = BEG_BYTE;
6894       coding->src_multibyte = Z < Z_BYTE;
6895     }
6896   else if (STRINGP (src_object))
6897     {
6898       code_conversion_save (0, 0);
6899       coding->src_pos = from;
6900       coding->src_pos_byte = from_byte;
6901     }
6902   else if (BUFFERP (src_object))
6903     {
6904       code_conversion_save (0, 0);
6905       set_buffer_internal (XBUFFER (src_object));
6906       if (EQ (src_object, dst_object))
6907         {
6908           saved_pt = PT, saved_pt_byte = PT_BYTE;
6909           coding->src_object = del_range_1 (from, to, 1, 1);
6910           coding->src_pos = 0;
6911           coding->src_pos_byte = 0;
6912         }
6913       else
6914         {
6915           if (from < GPT && to >= GPT)
6916             move_gap_both (from, from_byte);
6917           coding->src_pos = from;
6918           coding->src_pos_byte = from_byte;
6919         }
6920     }
6921   else
6922     code_conversion_save (0, 0);
6923   /* Set up the destination: a buffer, a fresh memory area (Qt), or none.  */
6924   if (BUFFERP (dst_object))
6925     {
6926       coding->dst_object = dst_object;
6927       if (EQ (src_object, dst_object))
6928         {
6929           coding->dst_pos = from;
6930           coding->dst_pos_byte = from_byte;
6931         }
6932       else
6933         {
6934           coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6935           coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6936         }
6937       coding->dst_multibyte
6938         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6939     }
6940   else if (EQ (dst_object, Qt))
6941     {
6942       coding->dst_object = Qnil;
6943       coding->dst_bytes = coding->src_chars;
6944       if (coding->dst_bytes == 0)
6945         coding->dst_bytes = 1;
6946       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6947       coding->dst_multibyte = 0;
6948     }
6949   else
6950     {
6951       coding->dst_object = Qnil;
6952       coding->dst_multibyte = 0;
6953     }
6954
6955   encode_coding (coding);
6956   /* For Qt, hand the result back as a string in CODING->dst_object.  */
6957   if (EQ (dst_object, Qt))
6958     {
6959       if (BUFFERP (coding->dst_object))
6960         coding->dst_object = Fbuffer_string ();
6961       else
6962         {
6963           coding->dst_object
6964             = make_unibyte_string ((char *) coding->destination,
6965                                    coding->produced);
6966           xfree (coding->destination);
6967         }
6968     }
6969
6970   if (saved_pt >= 0)
6971     {
6972       /* This is the case of:
6973          (BUFFERP (src_object) && EQ (src_object, dst_object))
6974          As we have moved PT while replacing the original buffer
6975          contents, we must recover it now.  */
6976       set_buffer_internal (XBUFFER (src_object));
6977       if (saved_pt < from)
6978         TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6979       else if (saved_pt < from + chars)
6980         TEMP_SET_PT_BOTH (from, from_byte);
6981       else if (! NILP (current_buffer->enable_multibyte_characters))
6982         TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6983                           saved_pt_byte + (coding->produced - bytes));
6984       else
6985         TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6986                           saved_pt_byte + (coding->produced - bytes));
6987     }
6988
6989   unbind_to (count, Qnil);
6990 }
6991 /* Return the name of the coding system assigned to the coding
6992    category that comes first in coding_priorities.  */
6993 Lisp_Object
6994 preferred_coding_system ()
6995 {
6996   int category = coding_priorities[0];
6997   int id = coding_categories[category].id;
6998   return CODING_ID_NAME (id);
6999 }
7000
7001 \f
7002 #ifdef emacs
7003 /*** 11. Emacs Lisp library functions ***/
7004 /* Lisp predicate for coding systems; nil is accepted as well.  */
7005 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
7006        doc: /* Return t if OBJECT is nil or a coding-system.
7007 See the documentation of `define-coding-system' for information
7008 about coding-system objects.  */)
7009      (obj)
7010      Lisp_Object obj;
7011 {
7012   return (! NILP (obj) && ! CODING_SYSTEM_P (obj)) ? Qnil : Qt;
7013 }
7014
7015 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
7016        Sread_non_nil_coding_system, 1, 1, 0,
7017        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
7018      (prompt)
7019      Lisp_Object prompt;
7020 {
7021   Lisp_Object name;
7022
7023   /* Prompt again for as long as the user enters an empty name.  */
7024   do
7025     name = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt,
7026                              Qnil, Qcoding_system_history, Qnil, Qnil);
7027   while (SCHARS (name) == 0);
7028   return Fintern (name, Qnil);
7029 }
7030
7031 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
7032        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
7033 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
7034      (prompt, default_coding_system)
7035      Lisp_Object prompt, default_coding_system;
7036 {
7037   Lisp_Object input;
7038   /* A symbol given as the default is replaced by its name string.  */
7039   if (SYMBOLP (default_coding_system))
7040     XSETSTRING (default_coding_system, XPNTR (SYMBOL_NAME (default_coding_system)));
7041   input = Fcompleting_read (prompt, Vcoding_system_alist, Qnil, Qt, Qnil,
7042                             Qcoding_system_history, default_coding_system, Qnil);
7043   return (SCHARS (input) != 0 ? Fintern (input, Qnil) : Qnil);
7044 }
7045
7046 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
7047        1, 1, 0,
7048        doc: /* Check validity of CODING-SYSTEM.
7049 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
7050 It is valid if it is nil or a symbol defined as a coding system by the
7051 function `define-coding-system'.  */)
7052   (coding_system)
7053      Lisp_Object coding_system;
7054 {
7055   CHECK_SYMBOL (coding_system);
7056   if (NILP (Fcoding_system_p (coding_system)))
7057     for (;;)  /* Signal again should Fsignal ever return.  */
7058       Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
7059   return coding_system;
7060 }
7061
7062 \f
7063 /* Detect how the bytes at SRC of length SRC_BYTES are encoded.  If
7064    HIGHEST is nonzero, return the coding system of the highest
7065    priority among the detected coding systems.  Otherwise return a
7066    list of detected coding systems sorted by their priorities.  If
7067    MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
7068    multibyte form but contains only ASCII and eight-bit chars.
7069    Otherwise, the bytes are raw bytes.
7070
7071    CODING-SYSTEM controls the detection as below:
7072
7073    If it is nil, detect both text-format and eol-format.  If the
7074    text-format part of CODING-SYSTEM is already specified
7075    (e.g. `iso-latin-1'), detect only eol-format.  If the eol-format
7076    part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
7077    detect only text-format.  */
7078
7079 Lisp_Object
7080 detect_coding_system (src, src_chars, src_bytes, highest, multibytep,
7081                       coding_system)
7082      const unsigned char *src;
7083      int src_chars, src_bytes, highest;
7084      int multibytep;
7085      Lisp_Object coding_system;
7086 {
7087   const unsigned char *src_end = src + src_bytes;
7088   Lisp_Object attrs, eol_type;
7089   Lisp_Object val;
7090   struct coding_system coding;
7091   int id;
7092   struct coding_detection_info detect_info;
7093   enum coding_category base_category;
7094
7095   if (NILP (coding_system))
7096     coding_system = Qundecided;
7097   setup_coding_system (coding_system, &coding);
7098   attrs = CODING_ID_ATTRS (coding.id);
7099   eol_type = CODING_ID_EOL_TYPE (coding.id);
7100   coding_system = CODING_ATTR_BASE_NAME (attrs);
7101
7102   coding.source = src;
7103   coding.src_chars = src_chars;
7104   coding.src_bytes = src_bytes;
7105   coding.src_multibyte = multibytep;
7106   coding.consumed = 0;
7107   coding.mode |= CODING_MODE_LAST_BLOCK;
7108
7109   detect_info.checked = detect_info.found = detect_info.rejected = 0;
7110
7111   /* At first, detect text-format if necessary.  */
7112   base_category = XINT (CODING_ATTR_CATEGORY (attrs));
7113   if (base_category == coding_category_undecided)
7114     {
7115       enum coding_category category;
7116       struct coding_system *this;
7117       int c, i;
7118
7119       /* Skip all ASCII bytes except for a few ISO2022 controls.  */
7120       for (i = 0; src < src_end; i++, src++)
7121         {
7122           c = *src;
7123           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
7124                                         || c == ISO_CODE_SI
7125                                         || c == ISO_CODE_SO)))
7126             break;
7127         }
7128       coding.head_ascii = src - coding.source;
7129
7130       if (src < src_end)
7131         for (i = 0; i < coding_category_raw_text; i++)
7132           {
7133             category = coding_priorities[i];
7134             this = coding_categories + category;
7135
7136             if (this->id < 0)
7137               {
7138                 /* No coding system of this category is defined.  */
7139                 detect_info.rejected |= (1 << category);
7140               }
7141             else if (category >= coding_category_raw_text)
7142               continue;
7143             else if (detect_info.checked & (1 << category))
7144               {
7145                 if (highest
7146                     && (detect_info.found & (1 << category)))
7147                   break;
7148               }
7149             else
7150               {
7151                 if ((*(this->detector)) (&coding, &detect_info)
7152                     && highest
7153                     && (detect_info.found & (1 << category)))
7154                   {
7155                     if (category == coding_category_utf_16_auto)
7156                       {
7157                         if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7158                           category = coding_category_utf_16_le;
7159                         else
7160                           category = coding_category_utf_16_be;
7161                       }
7162                     break;
7163                   }
7164               }
7165           }
7166
7167       if (detect_info.rejected == CATEGORY_MASK_ANY)
7168         {
7169           detect_info.found = CATEGORY_MASK_RAW_TEXT;
7170           id = coding_categories[coding_category_raw_text].id;
7171           val = Fcons (make_number (id), Qnil);
7172         }
7173       else if (! detect_info.rejected && ! detect_info.found)
7174         {
7175           detect_info.found = CATEGORY_MASK_ANY;
7176           id = coding_categories[coding_category_undecided].id;
7177           val = Fcons (make_number (id), Qnil);
7178         }
7179       else if (highest)
7180         {
7181           if (detect_info.found)
7182             {
7183               detect_info.found = 1 << category;
7184               val = Fcons (make_number (this->id), Qnil);
7185             }
7186           else
7187             for (i = 0; i < coding_category_raw_text; i++)
7188               if (! (detect_info.rejected & (1 << coding_priorities[i])))
7189                 {
7190                   detect_info.found = 1 << coding_priorities[i];
7191                   id = coding_categories[coding_priorities[i]].id;
7192                   val = Fcons (make_number (id), Qnil);
7193                   break;
7194                 }
7195         }
7196       else
7197         {
7198           int mask = detect_info.rejected | detect_info.found;
7199           int found = 0;
7200           val = Qnil;
7201
7202           for (i = coding_category_raw_text - 1; i >= 0; i--)
7203             {
7204               category = coding_priorities[i];
7205               if (! (mask & (1 << category)))
7206                 {
7207                   found |= 1 << category;
7208                   id = coding_categories[category].id;
7209                   val = Fcons (make_number (id), val);
7210                 }
7211             }
7212           for (i = coding_category_raw_text - 1; i >= 0; i--)
7213             {
7214               category = coding_priorities[i];
7215               if (detect_info.found & (1 << category))
7216                 {
7217                   id = coding_categories[category].id;
7218                   val = Fcons (make_number (id), val);
7219                 }
7220             }
7221           detect_info.found |= found;
7222         }
7223     }
7224   else if (base_category == coding_category_utf_16_auto)
7225     {
7226       if (detect_coding_utf_16 (&coding, &detect_info))
7227         {
7228           struct coding_system *this;
7229
7230           if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
7231             this = coding_categories + coding_category_utf_16_le;
7232           else if (detect_info.found & CATEGORY_MASK_UTF_16_BE)
7233             this = coding_categories + coding_category_utf_16_be;
7234           else if (detect_info.rejected & CATEGORY_MASK_UTF_16_LE_NOSIG)
7235             this = coding_categories + coding_category_utf_16_be_nosig;
7236           else
7237             this = coding_categories + coding_category_utf_16_le_nosig;
7238           val = Fcons (make_number (this->id), Qnil);
7239         }
7240     }
7241   else
7242     {
7243       detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
7244       val = Fcons (make_number (coding.id), Qnil);
7245     }
7246
7247   /* Then, detect eol-format if necessary.  */
7248   {
7249     int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
7250     Lisp_Object tail;
7251
7252     if (VECTORP (eol_type))
7253       {
7254         if (detect_info.found & ~CATEGORY_MASK_UTF_16)
7255           normal_eol = detect_eol (coding.source, src_bytes,
7256                                    coding_category_raw_text);
7257         if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
7258                                  | CATEGORY_MASK_UTF_16_BE_NOSIG))
7259           utf_16_be_eol = detect_eol (coding.source, src_bytes,
7260                                       coding_category_utf_16_be);
7261         if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
7262                                  | CATEGORY_MASK_UTF_16_LE_NOSIG))
7263           utf_16_le_eol = detect_eol (coding.source, src_bytes,
7264                                       coding_category_utf_16_le);
7265       }
7266     else
7267       {
7268         if (EQ (eol_type, Qunix))
7269           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
7270         else if (EQ (eol_type, Qdos))
7271           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
7272         else
7273           normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
7274       }
7275
7276     for (tail = val; CONSP (tail); tail = XCDR (tail))
7277       {
7278         enum coding_category category;
7279         int this_eol;
7280
7281         id = XINT (XCAR (tail));
7282         attrs = CODING_ID_ATTRS (id);
7283         category = XINT (CODING_ATTR_CATEGORY (attrs));
7284         eol_type = CODING_ID_EOL_TYPE (id);
7285         if (VECTORP (eol_type))
7286           {
7287             if (category == coding_category_utf_16_be
7288                 || category == coding_category_utf_16_be_nosig)
7289               this_eol = utf_16_be_eol;
7290             else if (category == coding_category_utf_16_le
7291                      || category == coding_category_utf_16_le_nosig)
7292               this_eol = utf_16_le_eol;
7293             else
7294               this_eol = normal_eol;
7295
7296             if (this_eol == EOL_SEEN_LF)
7297               XSETCAR (tail, AREF (eol_type, 0));
7298             else if (this_eol == EOL_SEEN_CRLF)
7299               XSETCAR (tail, AREF (eol_type, 1));
7300             else if (this_eol == EOL_SEEN_CR)
7301               XSETCAR (tail, AREF (eol_type, 2));
7302             else
7303               XSETCAR (tail, CODING_ID_NAME (id));
7304           }
7305         else
7306           XSETCAR (tail, CODING_ID_NAME (id));
7307       }
7308   }
7309
7310   return (highest ? XCAR (val) : val);
7311 }
7312
7313
7314 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
7315        2, 3, 0,
7316        doc: /* Detect coding system of the text in the region between START and END.
7317 Return a list of possible coding systems ordered by priority.
7318
7319 If only ASCII characters are found, it returns a list of single element
7320 `undecided' or its subsidiary coding system according to a detected
7321 end-of-line format.
7322
7323 If optional argument HIGHEST is non-nil, return the coding system of
7324 highest priority.  */)
7325      (start, end, highest)
7326      Lisp_Object start, end, highest;
7327 {
7328   int from, to;
7329   int from_byte, to_byte;
7330
7331   CHECK_NUMBER_COERCE_MARKER (start);
7332   CHECK_NUMBER_COERCE_MARKER (end);
7333
7334   validate_region (&start, &end);
7335   from = XINT (start), to = XINT (end);
7336   from_byte = CHAR_TO_BYTE (from);
7337   to_byte = CHAR_TO_BYTE (to);
7338
7339   if (from < GPT && to >= GPT)
7340     move_gap_both (to, to_byte);
7341
7342   return detect_coding_system (BYTE_POS_ADDR (from_byte),
7343                                to - from, to_byte - from_byte,
7344                                !NILP (highest),
7345                                !NILP (current_buffer
7346                                       ->enable_multibyte_characters),
7347                                Qnil);
7348 }
7349
7350 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
7351        1, 2, 0,
7352        doc: /* Detect coding system of the text in STRING.
7353 Return a list of possible coding systems ordered by priority.
7354
7355 If only ASCII characters are found, it returns a list of single element
7356 `undecided' or its subsidiary coding system according to a detected
7357 end-of-line format.
7358
7359 If optional argument HIGHEST is non-nil, return the coding system of
7360 highest priority.  */)
7361      (string, highest)
7362      Lisp_Object string, highest;
7363 {
7364   CHECK_STRING (string);
7365
7366   return detect_coding_system (SDATA (string),
7367                                SCHARS (string), SBYTES (string),
7368                                !NILP (highest), STRING_MULTIBYTE (string),
7369                                Qnil);
7370 }
7371
7372
7373 static INLINE int
7374 char_encodable_p (c, attrs)
7375      int c;
7376      Lisp_Object attrs;
7377 {
7378   Lisp_Object tail;
7379   struct charset *charset;
7380   Lisp_Object translation_table;
7381
7382   translation_table = CODING_ATTR_TRANS_TBL (attrs);
7383   if (! NILP (translation_table))
7384     c = translate_char (translation_table, c);
7385   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7386        CONSP (tail); tail = XCDR (tail))
7387     {
7388       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7389       if (CHAR_CHARSET_P (c, charset))
7390         break;
7391     }
7392   return (! NILP (tail));
7393 }
7394
7395
7396 /* Return a list of coding systems that safely encode the text between
7397    START and END.  If EXCLUDE is non-nil, it is a list of coding
7398    systems not to check.  The returned list doesn't contain any such
7399    coding systems.  In any case, if the text contains only ASCII or is
7400    unibyte, return t.  */
7401
7402 DEFUN ("find-coding-systems-region-internal",
7403        Ffind_coding_systems_region_internal,
7404        Sfind_coding_systems_region_internal, 2, 3, 0,
7405        doc: /* Internal use only.  */)
7406      (start, end, exclude)
7407      Lisp_Object start, end, exclude;
7408 {
7409   Lisp_Object coding_attrs_list, safe_codings;
7410   EMACS_INT start_byte, end_byte;
7411   const unsigned char *p, *pbeg, *pend;
7412   int c;
7413   Lisp_Object tail, elt;
7414
7415   if (STRINGP (start))
7416     {
7417       if (!STRING_MULTIBYTE (start)
7418           || SCHARS (start) == SBYTES (start))
7419         return Qt;
7420       start_byte = 0;
7421       end_byte = SBYTES (start);
7422     }
7423   else
7424     {
7425       CHECK_NUMBER_COERCE_MARKER (start);
7426       CHECK_NUMBER_COERCE_MARKER (end);
7427       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7428         args_out_of_range (start, end);
7429       if (NILP (current_buffer->enable_multibyte_characters))
7430         return Qt;
7431       start_byte = CHAR_TO_BYTE (XINT (start));
7432       end_byte = CHAR_TO_BYTE (XINT (end));
7433       if (XINT (end) - XINT (start) == end_byte - start_byte)
7434         return Qt;
7435
7436       if (XINT (start) < GPT && XINT (end) > GPT)
7437         {
7438           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7439             move_gap_both (XINT (start), start_byte);
7440           else
7441             move_gap_both (XINT (end), end_byte);
7442         }
7443     }
7444
7445   coding_attrs_list = Qnil;
7446   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7447     if (NILP (exclude)
7448         || NILP (Fmemq (XCAR (tail), exclude)))
7449       {
7450         Lisp_Object attrs;
7451
7452         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7453         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7454             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7455           {
7456             ASET (attrs, coding_attr_trans_tbl,
7457                   get_translation_table (attrs, 1, NULL));
7458             coding_attrs_list = Fcons (attrs, coding_attrs_list);
7459           }
7460       }
7461
7462   if (STRINGP (start))
7463     p = pbeg = SDATA (start);
7464   else
7465     p = pbeg = BYTE_POS_ADDR (start_byte);
7466   pend = p + (end_byte - start_byte);
7467
7468   while (p < pend && ASCII_BYTE_P (*p)) p++;
7469   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7470
7471   while (p < pend)
7472     {
7473       if (ASCII_BYTE_P (*p))
7474         p++;
7475       else
7476         {
7477           c = STRING_CHAR_ADVANCE (p);
7478
7479           charset_map_loaded = 0;
7480           for (tail = coding_attrs_list; CONSP (tail);)
7481             {
7482               elt = XCAR (tail);
7483               if (NILP (elt))
7484                 tail = XCDR (tail);
7485               else if (char_encodable_p (c, elt))
7486                 tail = XCDR (tail);
7487               else if (CONSP (XCDR (tail)))
7488                 {
7489                   XSETCAR (tail, XCAR (XCDR (tail)));
7490                   XSETCDR (tail, XCDR (XCDR (tail)));
7491                 }
7492               else
7493                 {
7494                   XSETCAR (tail, Qnil);
7495                   tail = XCDR (tail);
7496                 }
7497             }
7498           if (charset_map_loaded)
7499             {
7500               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7501
7502               if (STRINGP (start))
7503                 pbeg = SDATA (start);
7504               else
7505                 pbeg = BYTE_POS_ADDR (start_byte);
7506               p = pbeg + p_offset;
7507               pend = pbeg + pend_offset;
7508             }
7509         }
7510     }
7511
7512   safe_codings = list2 (Qraw_text, Qno_conversion);
7513   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7514     if (! NILP (XCAR (tail)))
7515       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7516
7517   return safe_codings;
7518 }
7519
7520
7521 DEFUN ("unencodable-char-position", Funencodable_char_position,
7522        Sunencodable_char_position, 3, 5, 0,
7523        doc: /*
7524 Return position of first un-encodable character in a region.
7525 START and END specfiy the region and CODING-SYSTEM specifies the
7526 encoding to check.  Return nil if CODING-SYSTEM does encode the region.
7527
7528 If optional 4th argument COUNT is non-nil, it specifies at most how
7529 many un-encodable characters to search.  In this case, the value is a
7530 list of positions.
7531
7532 If optional 5th argument STRING is non-nil, it is a string to search
7533 for un-encodable characters.  In that case, START and END are indexes
7534 to the string.  */)
7535      (start, end, coding_system, count, string)
7536      Lisp_Object start, end, coding_system, count, string;
7537 {
7538   int n;
7539   struct coding_system coding;
7540   Lisp_Object attrs, charset_list, translation_table;
7541   Lisp_Object positions;
7542   int from, to;
7543   const unsigned char *p, *stop, *pend;
7544   int ascii_compatible;
7545
7546   setup_coding_system (Fcheck_coding_system (coding_system), &coding);
7547   attrs = CODING_ID_ATTRS (coding.id);
7548   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
7549     return Qnil;
7550   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
7551   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7552   translation_table = get_translation_table (attrs, 1, NULL);
7553
7554   if (NILP (string))
7555     {
7556       validate_region (&start, &end);
7557       from = XINT (start);
7558       to = XINT (end);
7559       if (NILP (current_buffer->enable_multibyte_characters)
7560           || (ascii_compatible
7561               && (to - from) == (CHAR_TO_BYTE (to) - (CHAR_TO_BYTE (from)))))
7562         return Qnil;
7563       p = CHAR_POS_ADDR (from);
7564       pend = CHAR_POS_ADDR (to);
7565       if (from < GPT && to >= GPT)
7566         stop = GPT_ADDR;
7567       else
7568         stop = pend;
7569     }
7570   else
7571     {
7572       CHECK_STRING (string);
7573       CHECK_NATNUM (start);
7574       CHECK_NATNUM (end);
7575       from = XINT (start);
7576       to = XINT (end);
7577       if (from > to
7578           || to > SCHARS (string))
7579         args_out_of_range_3 (string, start, end);
7580       if (! STRING_MULTIBYTE (string))
7581         return Qnil;
7582       p = SDATA (string) + string_char_to_byte (string, from);
7583       stop = pend = SDATA (string) + string_char_to_byte (string, to);
7584       if (ascii_compatible && (to - from) == (pend - p))
7585         return Qnil;
7586     }
7587
7588   if (NILP (count))
7589     n = 1;
7590   else
7591     {
7592       CHECK_NATNUM (count);
7593       n = XINT (count);
7594     }
7595
7596   positions = Qnil;
7597   while (1)
7598     {
7599       int c;
7600
7601       if (ascii_compatible)
7602         while (p < stop && ASCII_BYTE_P (*p))
7603           p++, from++;
7604       if (p >= stop)
7605         {
7606           if (p >= pend)
7607             break;
7608           stop = pend;
7609           p = GAP_END_ADDR;
7610         }
7611
7612       c = STRING_CHAR_ADVANCE (p);
7613       if (! (ASCII_CHAR_P (c) && ascii_compatible)
7614           && ! char_charset (translate_char (translation_table, c),
7615                              charset_list, NULL))
7616         {
7617           positions = Fcons (make_number (from), positions);
7618           n--;
7619           if (n == 0)
7620             break;
7621         }
7622
7623       from++;
7624     }
7625
7626   return (NILP (count) ? Fcar (positions) : Fnreverse (positions));
7627 }
7628
7629
7630 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7631        Scheck_coding_systems_region, 3, 3, 0,
7632        doc: /* Check if the region is encodable by coding systems.
7633
7634 START and END are buffer positions specifying the region.
7635 CODING-SYSTEM-LIST is a list of coding systems to check.
7636
7637 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7638 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7639 whole region, POS0, POS1, ... are buffer positions where non-encodable
7640 characters are found.
7641
7642 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7643 value is nil.
7644
7645 START may be a string.  In that case, check if the string is
7646 encodable, and the value contains indices to the string instead of
7647 buffer positions.  END is ignored.  */)
7648      (start, end, coding_system_list)
7649      Lisp_Object start, end, coding_system_list;
7650 {
7651   Lisp_Object list;
7652   EMACS_INT start_byte, end_byte;
7653   int pos;
7654   const unsigned char *p, *pbeg, *pend;
7655   int c;
7656   Lisp_Object tail, elt, attrs;
7657
7658   if (STRINGP (start))
7659     {
7660       if (!STRING_MULTIBYTE (start)
7661           && SCHARS (start) != SBYTES (start))
7662         return Qnil;
7663       start_byte = 0;
7664       end_byte = SBYTES (start);
7665       pos = 0;
7666     }
7667   else
7668     {
7669       CHECK_NUMBER_COERCE_MARKER (start);
7670       CHECK_NUMBER_COERCE_MARKER (end);
7671       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7672         args_out_of_range (start, end);
7673       if (NILP (current_buffer->enable_multibyte_characters))
7674         return Qnil;
7675       start_byte = CHAR_TO_BYTE (XINT (start));
7676       end_byte = CHAR_TO_BYTE (XINT (end));
7677       if (XINT (end) - XINT (start) == end_byte - start_byte)
7678         return Qt;
7679
7680       if (XINT (start) < GPT && XINT (end) > GPT)
7681         {
7682           if ((GPT - XINT (start)) < (XINT (end) - GPT))
7683             move_gap_both (XINT (start), start_byte);
7684           else
7685             move_gap_both (XINT (end), end_byte);
7686         }
7687       pos = XINT (start);
7688     }
7689
7690   list = Qnil;
7691   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7692     {
7693       elt = XCAR (tail);
7694       attrs = AREF (CODING_SYSTEM_SPEC (elt), 0);
7695       ASET (attrs, coding_attr_trans_tbl,
7696             get_translation_table (attrs, 1, NULL));
7697       list = Fcons (Fcons (elt, Fcons (attrs, Qnil)), list);
7698     }
7699
7700   if (STRINGP (start))
7701     p = pbeg = SDATA (start);
7702   else
7703     p = pbeg = BYTE_POS_ADDR (start_byte);
7704   pend = p + (end_byte - start_byte);
7705
7706   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7707   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7708
7709   while (p < pend)
7710     {
7711       if (ASCII_BYTE_P (*p))
7712         p++;
7713       else
7714         {
7715           c = STRING_CHAR_ADVANCE (p);
7716
7717           charset_map_loaded = 0;
7718           for (tail = list; CONSP (tail); tail = XCDR (tail))
7719             {
7720               elt = XCDR (XCAR (tail));
7721               if (! char_encodable_p (c, XCAR (elt)))
7722                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7723             }
7724           if (charset_map_loaded)
7725             {
7726               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7727
7728               if (STRINGP (start))
7729                 pbeg = SDATA (start);
7730               else
7731                 pbeg = BYTE_POS_ADDR (start_byte);
7732               p = pbeg + p_offset;
7733               pend = pbeg + pend_offset;
7734             }
7735         }
7736       pos++;
7737     }
7738
7739   tail = list;
7740   list = Qnil;
7741   for (; CONSP (tail); tail = XCDR (tail))
7742     {
7743       elt = XCAR (tail);
7744       if (CONSP (XCDR (XCDR (elt))))
7745         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7746                       list);
7747     }
7748
7749   return list;
7750 }
7751
7752
7753 Lisp_Object
7754 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7755      Lisp_Object start, end, coding_system, dst_object;
7756      int encodep, norecord;
7757 {
7758   struct coding_system coding;
7759   EMACS_INT from, from_byte, to, to_byte;
7760   Lisp_Object src_object;
7761
7762   CHECK_NUMBER_COERCE_MARKER (start);
7763   CHECK_NUMBER_COERCE_MARKER (end);
7764   if (NILP (coding_system))
7765     coding_system = Qno_conversion;
7766   else
7767     CHECK_CODING_SYSTEM (coding_system);
7768   src_object = Fcurrent_buffer ();
7769   if (NILP (dst_object))
7770     dst_object = src_object;
7771   else if (! EQ (dst_object, Qt))
7772     CHECK_BUFFER (dst_object);
7773
7774   validate_region (&start, &end);
7775   from = XFASTINT (start);
7776   from_byte = CHAR_TO_BYTE (from);
7777   to = XFASTINT (end);
7778   to_byte = CHAR_TO_BYTE (to);
7779
7780   setup_coding_system (coding_system, &coding);
7781   coding.mode |= CODING_MODE_LAST_BLOCK;
7782
7783   if (encodep)
7784     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7785                           dst_object);
7786   else
7787     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7788                           dst_object);
7789   if (! norecord)
7790     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7791
7792   return (BUFFERP (dst_object)
7793           ? make_number (coding.produced_char)
7794           : coding.dst_object);
7795 }
7796
7797
7798 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7799        3, 4, "r\nzCoding system: ",
7800        doc: /* Decode the current region from the specified coding system.
7801 When called from a program, takes four arguments:
7802         START, END, CODING-SYSTEM, and DESTINATION.
7803 START and END are buffer positions.
7804
7805 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7806 If nil, the region between START and END is replace by the decoded text.
7807 If buffer, the decoded text is inserted in the buffer.
7808 If t, the decoded text is returned.
7809
7810 This function sets `last-coding-system-used' to the precise coding system
7811 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7812 not fully specified.)
7813 It returns the length of the decoded text.  */)
7814      (start, end, coding_system, destination)
7815      Lisp_Object start, end, coding_system, destination;
7816 {
7817   return code_convert_region (start, end, coding_system, destination, 0, 0);
7818 }
7819
7820 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7821        3, 4, "r\nzCoding system: ",
7822        doc: /* Encode the current region by specified coding system.
7823 When called from a program, takes three arguments:
7824 START, END, and CODING-SYSTEM.  START and END are buffer positions.
7825
7826 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7827 If nil, the region between START and END is replace by the encoded text.
7828 If buffer, the encoded text is inserted in the buffer.
7829 If t, the encoded text is returned.
7830
7831 This function sets `last-coding-system-used' to the precise coding system
7832 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7833 not fully specified.)
7834 It returns the length of the encoded text.  */)
7835   (start, end, coding_system, destination)
7836      Lisp_Object start, end, coding_system, destination;
7837 {
7838   return code_convert_region (start, end, coding_system, destination, 1, 0);
7839 }
7840
7841 Lisp_Object
7842 code_convert_string (string, coding_system, dst_object,
7843                      encodep, nocopy, norecord)
7844      Lisp_Object string, coding_system, dst_object;
7845      int encodep, nocopy, norecord;
7846 {
7847   struct coding_system coding;
7848   EMACS_INT chars, bytes;
7849
7850   CHECK_STRING (string);
7851   if (NILP (coding_system))
7852     {
7853       if (! norecord)
7854         Vlast_coding_system_used = Qno_conversion;
7855       if (NILP (dst_object))
7856         return (nocopy ? Fcopy_sequence (string) : string);
7857     }
7858
7859   if (NILP (coding_system))
7860     coding_system = Qno_conversion;
7861   else
7862     CHECK_CODING_SYSTEM (coding_system);
7863   if (NILP (dst_object))
7864     dst_object = Qt;
7865   else if (! EQ (dst_object, Qt))
7866     CHECK_BUFFER (dst_object);
7867
7868   setup_coding_system (coding_system, &coding);
7869   coding.mode |= CODING_MODE_LAST_BLOCK;
7870   chars = SCHARS (string);
7871   bytes = SBYTES (string);
7872   if (encodep)
7873     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7874   else
7875     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7876   if (! norecord)
7877     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7878
7879   return (BUFFERP (dst_object)
7880           ? make_number (coding.produced_char)
7881           : coding.dst_object);
7882 }
7883
7884
7885 /* Encode or decode STRING according to CODING_SYSTEM.
7886    Do not set Vlast_coding_system_used.
7887
7888    This function is called only from macros DECODE_FILE and
7889    ENCODE_FILE, thus we ignore character composition.  */
7890
7891 Lisp_Object
7892 code_convert_string_norecord (string, coding_system, encodep)
7893      Lisp_Object string, coding_system;
7894      int encodep;
7895 {
7896   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7897 }
7898
7899
7900 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7901        2, 4, 0,
7902        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7903
7904 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7905 if the decoding operation is trivial.
7906
7907 Optional fourth arg BUFFER non-nil meant that the decoded text is
7908 inserted in BUFFER instead of returned as a string.  In this case,
7909 the return value is BUFFER.
7910
7911 This function sets `last-coding-system-used' to the precise coding system
7912 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7913 not fully specified.  */)
7914   (string, coding_system, nocopy, buffer)
7915      Lisp_Object string, coding_system, nocopy, buffer;
7916 {
7917   return code_convert_string (string, coding_system, buffer,
7918                               0, ! NILP (nocopy), 0);
7919 }
7920
7921 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7922        2, 4, 0,
7923        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7924
7925 Optional third arg NOCOPY non-nil means it is OK to return STRING
7926 itself if the encoding operation is trivial.
7927
7928 Optional fourth arg BUFFER non-nil meant that the encoded text is
7929 inserted in BUFFER instead of returned as a string.  In this case,
7930 the return value is BUFFER.
7931
7932 This function sets `last-coding-system-used' to the precise coding system
7933 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7934 not fully specified.)  */)
7935      (string, coding_system, nocopy, buffer)
7936      Lisp_Object string, coding_system, nocopy, buffer;
7937 {
7938   return code_convert_string (string, coding_system, buffer,
7939                               1, ! NILP (nocopy), 1);
7940 }
7941
7942 \f
7943 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7944        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7945 Return the corresponding character.  */)
7946      (code)
7947      Lisp_Object code;
7948 {
7949   Lisp_Object spec, attrs, val;
7950   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7951   int c;
7952
7953   CHECK_NATNUM (code);
7954   c = XFASTINT (code);
7955   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7956   attrs = AREF (spec, 0);
7957
7958   if (ASCII_BYTE_P (c)
7959       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7960     return code;
7961
7962   val = CODING_ATTR_CHARSET_LIST (attrs);
7963   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7964   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7965   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7966
7967   if (c <= 0x7F)
7968     charset = charset_roman;
7969   else if (c >= 0xA0 && c < 0xDF)
7970     {
7971       charset = charset_kana;
7972       c -= 0x80;
7973     }
7974   else
7975     {
7976       int s1 = c >> 8, s2 = c & 0xFF;
7977
7978       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7979           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7980         error ("Invalid code: %d", code);
7981       SJIS_TO_JIS (c);
7982       charset = charset_kanji;
7983     }
7984   c = DECODE_CHAR (charset, c);
7985   if (c < 0)
7986     error ("Invalid code: %d", code);
7987   return make_number (c);
7988 }
7989
7990
7991 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7992        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7993 Return the corresponding code in SJIS.  */)
7994      (ch)
7995     Lisp_Object ch;
7996 {
7997   Lisp_Object spec, attrs, charset_list;
7998   int c;
7999   struct charset *charset;
8000   unsigned code;
8001
8002   CHECK_CHARACTER (ch);
8003   c = XFASTINT (ch);
8004   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
8005   attrs = AREF (spec, 0);
8006
8007   if (ASCII_CHAR_P (c)
8008       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8009     return ch;
8010
8011   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8012   charset = char_charset (c, charset_list, &code);
8013   if (code == CHARSET_INVALID_CODE (charset))
8014     error ("Can't encode by shift_jis encoding: %d", c);
8015   JIS_TO_SJIS (code);
8016
8017   return make_number (code);
8018 }
8019
8020 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
8021        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
8022 Return the corresponding character.  */)
8023      (code)
8024      Lisp_Object code;
8025 {
8026   Lisp_Object spec, attrs, val;
8027   struct charset *charset_roman, *charset_big5, *charset;
8028   int c;
8029
8030   CHECK_NATNUM (code);
8031   c = XFASTINT (code);
8032   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8033   attrs = AREF (spec, 0);
8034
8035   if (ASCII_BYTE_P (c)
8036       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8037     return code;
8038
8039   val = CODING_ATTR_CHARSET_LIST (attrs);
8040   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
8041   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
8042
8043   if (c <= 0x7F)
8044     charset = charset_roman;
8045   else
8046     {
8047       int b1 = c >> 8, b2 = c & 0x7F;
8048       if (b1 < 0xA1 || b1 > 0xFE
8049           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
8050         error ("Invalid code: %d", code);
8051       charset = charset_big5;
8052     }
8053   c = DECODE_CHAR (charset, (unsigned )c);
8054   if (c < 0)
8055     error ("Invalid code: %d", code);
8056   return make_number (c);
8057 }
8058
8059 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
8060        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
8061 Return the corresponding character code in Big5.  */)
8062      (ch)
8063      Lisp_Object ch;
8064 {
8065   Lisp_Object spec, attrs, charset_list;
8066   struct charset *charset;
8067   int c;
8068   unsigned code;
8069
8070   CHECK_CHARACTER (ch);
8071   c = XFASTINT (ch);
8072   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
8073   attrs = AREF (spec, 0);
8074   if (ASCII_CHAR_P (c)
8075       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
8076     return ch;
8077
8078   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
8079   charset = char_charset (c, charset_list, &code);
8080   if (code == CHARSET_INVALID_CODE (charset))
8081     error ("Can't encode by Big5 encoding: %d", c);
8082
8083   return make_number (code);
8084 }
8085
8086 \f
8087 DEFUN ("set-terminal-coding-system-internal",
8088        Fset_terminal_coding_system_internal,
8089        Sset_terminal_coding_system_internal, 1, 1, 0,
8090        doc: /* Internal use only.  */)
8091      (coding_system)
8092      Lisp_Object coding_system;
8093 {
8094   CHECK_SYMBOL (coding_system);
8095   setup_coding_system (Fcheck_coding_system (coding_system),
8096                         &terminal_coding);
8097
8098   /* We had better not send unsafe characters to terminal.  */
8099   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
8100   /* Characer composition should be disabled.  */
8101   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8102   terminal_coding.src_multibyte = 1;
8103   terminal_coding.dst_multibyte = 0;
8104   return Qnil;
8105 }
8106
8107 DEFUN ("set-safe-terminal-coding-system-internal",
8108        Fset_safe_terminal_coding_system_internal,
8109        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
8110        doc: /* Internal use only.  */)
8111      (coding_system)
8112      Lisp_Object coding_system;
8113 {
8114   CHECK_SYMBOL (coding_system);
8115   setup_coding_system (Fcheck_coding_system (coding_system),
8116                        &safe_terminal_coding);
8117   /* Characer composition should be disabled.  */
8118   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8119   safe_terminal_coding.src_multibyte = 1;
8120   safe_terminal_coding.dst_multibyte = 0;
8121   return Qnil;
8122 }
8123
8124 DEFUN ("terminal-coding-system",
8125        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
8126        doc: /* Return coding system specified for terminal output.  */)
8127      ()
8128 {
8129   return CODING_ID_NAME (terminal_coding.id);
8130 }
8131
8132 DEFUN ("set-keyboard-coding-system-internal",
8133        Fset_keyboard_coding_system_internal,
8134        Sset_keyboard_coding_system_internal, 1, 1, 0,
8135        doc: /* Internal use only.  */)
8136      (coding_system)
8137      Lisp_Object coding_system;
8138 {
8139   CHECK_SYMBOL (coding_system);
8140   setup_coding_system (Fcheck_coding_system (coding_system),
8141                        &keyboard_coding);
8142   /* Characer composition should be disabled.  */
8143   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
8144   return Qnil;
8145 }
8146
8147 DEFUN ("keyboard-coding-system",
8148        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
8149        doc: /* Return coding system specified for decoding keyboard input.  */)
8150      ()
8151 {
8152   return CODING_ID_NAME (keyboard_coding.id);
8153 }
8154
8155 \f
8156 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
8157        Sfind_operation_coding_system,  1, MANY, 0,
8158        doc: /* Choose a coding system for an operation based on the target name.
8159 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
8160 DECODING-SYSTEM is the coding system to use for decoding
8161 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
8162 for encoding (in case OPERATION does encoding).
8163
8164 The first argument OPERATION specifies an I/O primitive:
8165   For file I/O, `insert-file-contents' or `write-region'.
8166   For process I/O, `call-process', `call-process-region', or `start-process'.
8167   For network I/O, `open-network-stream'.
8168
8169 The remaining arguments should be the same arguments that were passed
8170 to the primitive.  Depending on which primitive, one of those arguments
8171 is selected as the TARGET.  For example, if OPERATION does file I/O,
8172 whichever argument specifies the file name is TARGET.
8173
8174 TARGET has a meaning which depends on OPERATION:
8175   For file I/O, TARGET is a file name.
8176   For process I/O, TARGET is a process name.
8177   For network I/O, TARGET is a service name or a port number
8178
8179 This function looks up what specified for TARGET in,
8180 `file-coding-system-alist', `process-coding-system-alist',
8181 or `network-coding-system-alist' depending on OPERATION.
8182 They may specify a coding system, a cons of coding systems,
8183 or a function symbol to call.
8184 In the last case, we call the function with one argument,
8185 which is a list of all the arguments given to this function.
8186
8187 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
8188      (nargs, args)
8189      int nargs;
8190      Lisp_Object *args;
8191 {
8192   Lisp_Object operation, target_idx, target, val;
8193   register Lisp_Object chain;
8194
8195   if (nargs < 2)
8196     error ("Too few arguments");
8197   operation = args[0];
8198   if (!SYMBOLP (operation)
8199       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
8200     error ("Invalid first arguement");
8201   if (nargs < 1 + XINT (target_idx))
8202     error ("Too few arguments for operation: %s",
8203            SDATA (SYMBOL_NAME (operation)));
8204   target = args[XINT (target_idx) + 1];
8205   if (!(STRINGP (target)
8206         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
8207     error ("Invalid %dth argument", XINT (target_idx) + 1);
8208
8209   chain = ((EQ (operation, Qinsert_file_contents)
8210             || EQ (operation, Qwrite_region))
8211            ? Vfile_coding_system_alist
8212            : (EQ (operation, Qopen_network_stream)
8213               ? Vnetwork_coding_system_alist
8214               : Vprocess_coding_system_alist));
8215   if (NILP (chain))
8216     return Qnil;
8217
8218   for (; CONSP (chain); chain = XCDR (chain))
8219     {
8220       Lisp_Object elt;
8221
8222       elt = XCAR (chain);
8223       if (CONSP (elt)
8224           && ((STRINGP (target)
8225                && STRINGP (XCAR (elt))
8226                && fast_string_match (XCAR (elt), target) >= 0)
8227               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
8228         {
8229           val = XCDR (elt);
8230           /* Here, if VAL is both a valid coding system and a valid
8231              function symbol, we return VAL as a coding system.  */
8232           if (CONSP (val))
8233             return val;
8234           if (! SYMBOLP (val))
8235             return Qnil;
8236           if (! NILP (Fcoding_system_p (val)))
8237             return Fcons (val, val);
8238           if (! NILP (Ffboundp (val)))
8239             {
8240               val = call1 (val, Flist (nargs, args));
8241               if (CONSP (val))
8242                 return val;
8243               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
8244                 return Fcons (val, val);
8245             }
8246           return Qnil;
8247         }
8248     }
8249   return Qnil;
8250 }
8251
8252 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
8253        Sset_coding_system_priority, 0, MANY, 0,
8254        doc: /* Assign higher priority to the coding systems given as arguments.
8255 If multiple coding systems belongs to the same category,
8256 all but the first one are ignored.
8257
8258 usage: (set-coding-system-priority ...)  */)
8259      (nargs, args)
8260      int nargs;
8261      Lisp_Object *args;
8262 {
8263   int i, j;
8264   int changed[coding_category_max];
8265   enum coding_category priorities[coding_category_max];
8266
8267   bzero (changed, sizeof changed);
8268
8269   for (i = j = 0; i < nargs; i++)
8270     {
8271       enum coding_category category;
8272       Lisp_Object spec, attrs;
8273
8274       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
8275       attrs = AREF (spec, 0);
8276       category = XINT (CODING_ATTR_CATEGORY (attrs));
8277       if (changed[category])
8278         /* Ignore this coding system because a coding system of the
8279            same category already had a higher priority.  */
8280         continue;
8281       changed[category] = 1;
8282       priorities[j++] = category;
8283       if (coding_categories[category].id >= 0
8284           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
8285         setup_coding_system (args[i], &coding_categories[category]);
8286       Fset (AREF (Vcoding_category_table, category), args[i]);
8287     }
8288
8289   /* Now we have decided top J priorities.  Reflect the order of the
8290      original priorities to the remaining priorities.  */
8291
8292   for (i = j, j = 0; i < coding_category_max; i++, j++)
8293     {
8294       while (j < coding_category_max
8295              && changed[coding_priorities[j]])
8296         j++;
8297       if (j == coding_category_max)
8298         abort ();
8299       priorities[i] = coding_priorities[j];
8300     }
8301
8302   bcopy (priorities, coding_priorities, sizeof priorities);
8303
8304   /* Update `coding-category-list'.  */
8305   Vcoding_category_list = Qnil;
8306   for (i = coding_category_max - 1; i >= 0; i--)
8307     Vcoding_category_list
8308       = Fcons (AREF (Vcoding_category_table, priorities[i]),
8309                Vcoding_category_list);
8310
8311   return Qnil;
8312 }
8313
8314 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
8315        Scoding_system_priority_list, 0, 1, 0,
8316        doc: /* Return a list of coding systems ordered by their priorities.
8317 HIGHESTP non-nil means just return the highest priority one.  */)
8318      (highestp)
8319      Lisp_Object highestp;
8320 {
8321   int i;
8322   Lisp_Object val;
8323
8324   for (i = 0, val = Qnil; i < coding_category_max; i++)
8325     {
8326       enum coding_category category = coding_priorities[i];
8327       int id = coding_categories[category].id;
8328       Lisp_Object attrs;
8329
8330       if (id < 0)
8331         continue;
8332       attrs = CODING_ID_ATTRS (id);
8333       if (! NILP (highestp))
8334         return CODING_ATTR_BASE_NAME (attrs);
8335       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
8336     }
8337   return Fnreverse (val);
8338 }
8339
8340 static char *suffixes[] = { "-unix", "-dos", "-mac" };
8341
8342 static Lisp_Object
8343 make_subsidiaries (base)
8344      Lisp_Object base;
8345 {
8346   Lisp_Object subsidiaries;
8347   int base_name_len = SBYTES (SYMBOL_NAME (base));
8348   char *buf = (char *) alloca (base_name_len + 6);
8349   int i;
8350
8351   bcopy (SDATA (SYMBOL_NAME (base)), buf, base_name_len);
8352   subsidiaries = Fmake_vector (make_number (3), Qnil);
8353   for (i = 0; i < 3; i++)
8354     {
8355       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
8356       ASET (subsidiaries, i, intern (buf));
8357     }
8358   return subsidiaries;
8359 }
8360
8361
/* Define a coding system from the argument vector ARGS (NARGS
   elements, indexed by the coding_arg_* constants).

   The function validates the common arguments and stores them in a
   fresh attribute vector, then validates the arguments specific to
   the coding type and derives the coding category from them.  It
   registers the name (and, when the eol-type argument is nil, three
   subsidiary names made by make_subsidiaries) in
   Vcoding_system_hash_table, Vcoding_system_list and
   Vcoding_system_alist.

   Signal `wrong-number-of-arguments' if ARGS is too short for the
   coding type.  Return nil.  */

DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
       Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
       doc: /* For internal use only.
usage: (define-coding-system-internal ...)  */)
     (nargs, args)
     int nargs;
     Lisp_Object *args;
{
  Lisp_Object name;
  Lisp_Object spec_vec;         /* [ ATTRS ALIASES EOL_TYPE ] */
  Lisp_Object attrs;            /* Vector of attributes.  */
  Lisp_Object eol_type;
  Lisp_Object aliases;
  Lisp_Object coding_type, charset_list, safe_charsets;
  enum coding_category category;
  Lisp_Object tail, val;
  int max_charset_id = 0;
  int i;

  /* Every coding type needs at least the common arguments.  */
  if (nargs < coding_arg_max)
    goto short_args;

  attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);

  name = args[coding_arg_name];
  CHECK_SYMBOL (name);
  CODING_ATTR_BASE_NAME (attrs) = name;

  /* The mnemonic is either a string or a character.  */
  val = args[coding_arg_mnemonic];
  if (! STRINGP (val))
    CHECK_CHARACTER (val);
  CODING_ATTR_MNEMONIC (attrs) = val;

  coding_type = args[coding_arg_coding_type];
  CHECK_SYMBOL (coding_type);
  CODING_ATTR_TYPE (attrs) = coding_type;

  /* The charset list is either a symbol or a list of charsets.  The
     symbols `iso-2022' and `emacs-mule' select a predefined list and
     are accepted only for the coding type of the same name.  While
     walking the list, record the largest charset ID.  */
  charset_list = args[coding_arg_charset_list];
  if (SYMBOLP (charset_list))
    {
      if (EQ (charset_list, Qiso_2022))
        {
          if (! EQ (coding_type, Qiso_2022))
            error ("Invalid charset-list");
          charset_list = Viso_2022_charset_list;
        }
      else if (EQ (charset_list, Qemacs_mule))
        {
          if (! EQ (coding_type, Qemacs_mule))
            error ("Invalid charset-list");
          charset_list = Vemacs_mule_charset_list;
        }
      for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
        if (max_charset_id < XFASTINT (XCAR (tail)))
          max_charset_id = XFASTINT (XCAR (tail));
    }
  else
    {
      /* Work on a copy, replacing each charset by its ID.  */
      charset_list = Fcopy_sequence (charset_list);
      for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
        {
          struct charset *charset;

          val = Fcar (tail);
          CHECK_CHARSET_GET_CHARSET (val, charset);
          /* For iso-2022 the charset needs a non-negative ISO final,
             for emacs-mule a non-negative emacs-mule ID; other coding
             types put no such restriction here.  */
          if (EQ (coding_type, Qiso_2022)
              ? CHARSET_ISO_FINAL (charset) < 0
              : EQ (coding_type, Qemacs_mule)
              ? CHARSET_EMACS_MULE_ID (charset) < 0
              : 0)
            error ("Can't handle charset `%s'",
                   SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

          XSETCAR (tail, make_number (charset->id));
          if (max_charset_id < charset->id)
            max_charset_id = charset->id;
        }
    }
  CODING_ATTR_CHARSET_LIST (attrs) = charset_list;

  /* SAFE_CHARSETS is a string indexed by charset ID: every element
     starts as 255 and the elements of the listed charsets are set
     to 0.  */
  safe_charsets = Fmake_string (make_number (max_charset_id + 1),
                                make_number (255));
  for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
    SSET (safe_charsets, XFASTINT (XCAR (tail)), 0);
  CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;

  /* This is the value given by the caller; the type specific code
     below may override it.  */
  CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];

  /* A translation table is a char-table, a cons, or a symbol.  */
  val = args[coding_arg_decode_translation_table];
  if (! CHAR_TABLE_P (val) && ! CONSP (val))
    CHECK_SYMBOL (val);
  CODING_ATTR_DECODE_TBL (attrs) = val;

  val = args[coding_arg_encode_translation_table];
  if (! CHAR_TABLE_P (val) && ! CONSP (val))
    CHECK_SYMBOL (val);
  CODING_ATTR_ENCODE_TBL (attrs) = val;

  val = args[coding_arg_post_read_conversion];
  CHECK_SYMBOL (val);
  CODING_ATTR_POST_READ (attrs) = val;

  val = args[coding_arg_pre_write_conversion];
  CHECK_SYMBOL (val);
  CODING_ATTR_PRE_WRITE (attrs) = val;

  /* The default character is a space unless specified.  */
  val = args[coding_arg_default_char];
  if (NILP (val))
    CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
  else
    {
      CHECK_CHARACTER (val);
      CODING_ATTR_DEFAULT_CHAR (attrs) = val;
    }

  /* Normalize the for-unibyte flag to nil or t.  */
  val = args[coding_arg_for_unibyte];
  CODING_ATTR_FOR_UNIBYTE (attrs) = NILP (val) ? Qnil : Qt;

  val = args[coding_arg_plist];
  CHECK_LIST (val);
  CODING_ATTR_PLIST (attrs) = val;

  /* From here on, handle the arguments specific to each coding type
     and decide CATEGORY.  */
  if (EQ (coding_type, Qcharset))
    {
      /* Generate a lisp vector of 256 elements.  Each element is nil,
         integer, or a list of charset IDs.

         If Nth element is nil, the byte code N is invalid in this
         coding system.

         If Nth element is a number NUM, N is the first byte of a
         charset whose ID is NUM.

         If Nth element is a list of charset IDs, N is the first byte
         of one of them.  The list is sorted by dimensions of the
         charsets.  A charset of smaller dimension comes first.  */
      val = Fmake_vector (make_number (256), Qnil);

      for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
        {
          struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
          int dim = CHARSET_DIMENSION (charset);
          /* Index into code_space of the range used below as the set
             of first bytes of this charset.  */
          int idx = (dim - 1) * 4;

          if (CHARSET_ASCII_COMPATIBLE_P (charset))
            CODING_ATTR_ASCII_COMPAT (attrs) = Qt;

          for (i = charset->code_space[idx];
               i <= charset->code_space[idx + 1]; i++)
            {
              Lisp_Object tmp, tmp2;
              int dim2;

              tmp = AREF (val, i);
              if (NILP (tmp))
                /* First charset to claim byte I.  */
                tmp = XCAR (tail);
              else if (NUMBERP (tmp))
                {
                  /* Second charset for byte I: make a two-element
                     list, smaller dimension first.  */
                  dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
                  if (dim < dim2)
                    tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
                  else
                    tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
                }
              else
                {
                  /* Find the first element whose dimension is larger
                     than DIM.  */
                  for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
                    {
                      dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
                      if (dim < dim2)
                        break;
                    }
                  if (NILP (tmp2))
                    /* No such element: append at the end.  */
                    tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
                  else
                    {
                      /* Insert before TMP2 in place: duplicate the
                         cell of TMP2 behind it, then overwrite its
                         car.  */
                      XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
                      XSETCAR (tmp2, XCAR (tail));
                    }
                }
              ASET (val, i, tmp);
            }
        }
      ASET (attrs, coding_attr_charset_valids, val);
      category = coding_category_charset;
    }
  else if (EQ (coding_type, Qccl))
    {
      Lisp_Object valids;

      if (nargs < coding_arg_ccl_max)
        goto short_args;

      /* A CCL program given as a vector is copied before it is
         stored.  */
      val = args[coding_arg_ccl_decoder];
      CHECK_CCL_PROGRAM (val);
      if (VECTORP (val))
        val = Fcopy_sequence (val);
      ASET (attrs, coding_attr_ccl_decoder, val);

      val = args[coding_arg_ccl_encoder];
      CHECK_CCL_PROGRAM (val);
      if (VECTORP (val))
        val = Fcopy_sequence (val);
      ASET (attrs, coding_attr_ccl_encoder, val);

      /* VALIDS is a 256-element string; element N is set to 1 for
         each N listed.  Each element of the argument is an integer
         in 0..255 or a cons (FROM . TO) with FROM <= TO <= 255.  */
      val = args[coding_arg_ccl_valids];
      valids = Fmake_string (make_number (256), make_number (0));
      for (tail = val; !NILP (tail); tail = Fcdr (tail))
        {
          int from, to;

          val = Fcar (tail);
          if (INTEGERP (val))
            {
              from = to = XINT (val);
              if (from < 0 || from > 255)
                args_out_of_range_3 (val, make_number (0), make_number (255));
            }
          else
            {
              CHECK_CONS (val);
              CHECK_NATNUM_CAR (val);
              CHECK_NATNUM_CDR (val);
              from = XINT (XCAR (val));
              if (from > 255)
                args_out_of_range_3 (XCAR (val),
                                     make_number (0), make_number (255));
              to = XINT (XCDR (val));
              if (to < from || to > 255)
                args_out_of_range_3 (XCDR (val),
                                     XCAR (val), make_number (255));
            }
          for (i = from; i <= to; i++)
            SSET (valids, i, 1);
        }
      ASET (attrs, coding_attr_ccl_valids, valids);

      category = coding_category_ccl;
    }
  else if (EQ (coding_type, Qutf_16))
    {
      Lisp_Object bom, endian;

      CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;

      if (nargs < coding_arg_utf16_max)
        goto short_args;

      /* BOM is nil, t, or a cons of two coding systems.  */
      bom = args[coding_arg_utf16_bom];
      if (! NILP (bom) && ! EQ (bom, Qt))
        {
          CHECK_CONS (bom);
          val = XCAR (bom);
          CHECK_CODING_SYSTEM (val);
          val = XCDR (bom);
          CHECK_CODING_SYSTEM (val);
        }
      ASET (attrs, coding_attr_utf_16_bom, bom);

      /* ENDIAN is `big' or `little'; nil means `big'.  */
      endian = args[coding_arg_utf16_endian];
      CHECK_SYMBOL (endian);
      if (NILP (endian))
        endian = Qbig;
      else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
        error ("Invalid endian: %s", SDATA (SYMBOL_NAME (endian)));
      ASET (attrs, coding_attr_utf_16_endian, endian);

      /* A cons BOM gives the auto category, a nil BOM one of the
         nosig categories, and t one of the remaining be/le
         categories.  */
      category = (CONSP (bom)
                  ? coding_category_utf_16_auto
                  : NILP (bom)
                  ? (EQ (endian, Qbig)
                     ? coding_category_utf_16_be_nosig
                     : coding_category_utf_16_le_nosig)
                  : (EQ (endian, Qbig)
                     ? coding_category_utf_16_be
                     : coding_category_utf_16_le));
    }
  else if (EQ (coding_type, Qiso_2022))
    {
      Lisp_Object initial, reg_usage, request, flags;
      int i;

      if (nargs < coding_arg_iso2022_max)
        goto short_args;

      /* INITIAL is a vector whose first four elements are charsets
         or nil.  In the copy, each charset is replaced by its ID and
         nil by -1.  */
      initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
      CHECK_VECTOR (initial);
      for (i = 0; i < 4; i++)
        {
          val = Faref (initial, make_number (i));
          if (! NILP (val))
            {
              struct charset *charset;

              CHECK_CHARSET_GET_CHARSET (val, charset);
              ASET (initial, i, make_number (CHARSET_ID (charset)));
              if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
                CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
            }
          else
            ASET (initial, i, make_number (-1));
        }

      /* REG_USAGE is a cons of two numbers.  */
      reg_usage = args[coding_arg_iso2022_reg_usage];
      CHECK_CONS (reg_usage);
      CHECK_NUMBER_CAR (reg_usage);
      CHECK_NUMBER_CDR (reg_usage);

      /* REQUEST is a list of (CHARSET . REGISTER) with REGISTER less
         than 4.  Only the list itself is copied, so the XSETCAR
         below modifies the conses of the argument.  */
      request = Fcopy_sequence (args[coding_arg_iso2022_request]);
      for (tail = request; ! NILP (tail); tail = Fcdr (tail))
        {
          int id;
          Lisp_Object tmp;

          val = Fcar (tail);
          CHECK_CONS (val);
          tmp = XCAR (val);
          CHECK_CHARSET_GET_ID (tmp, id);
          CHECK_NATNUM_CDR (val);
          if (XINT (XCDR (val)) >= 4)
            error ("Invalid graphic register number: %d", XINT (XCDR (val)));
          XSETCAR (val, make_number (id));
        }

      /* I keeps the flags as given by the caller; the stored FLAGS
         additionally gets CODING_ISO_FLAG_FULL_SUPPORT when the
         charset list argument was the symbol `iso-2022'.  */
      flags = args[coding_arg_iso2022_flags];
      CHECK_NATNUM (flags);
      i = XINT (flags);
      if (EQ (args[coding_arg_charset_list], Qiso_2022))
        flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);

      ASET (attrs, coding_attr_iso_initial, initial);
      ASET (attrs, coding_attr_iso_usage, reg_usage);
      ASET (attrs, coding_attr_iso_request, request);
      ASET (attrs, coding_attr_iso_flags, flags);
      setup_iso_safe_charsets (attrs);

      /* Decide the category from the flags, the charset list
         argument, and the second element of INITIAL.  */
      if (i & CODING_ISO_FLAG_SEVEN_BITS)
        category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
                          | CODING_ISO_FLAG_SINGLE_SHIFT))
                    ? coding_category_iso_7_else
                    : EQ (args[coding_arg_charset_list], Qiso_2022)
                    ? coding_category_iso_7
                    : coding_category_iso_7_tight);
      else
        {
          int id = XINT (AREF (initial, 1));

          category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
                       || EQ (args[coding_arg_charset_list], Qiso_2022)
                       || id < 0)
                      ? coding_category_iso_8_else
                      : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
                      ? coding_category_iso_8_1
                      : coding_category_iso_8_2);
        }
      /* Only the iso_8_1 and iso_8_2 categories keep the ASCII
         compatibility attribute.  */
      if (category != coding_category_iso_8_1
          && category != coding_category_iso_8_2)
        CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
    }
  else if (EQ (coding_type, Qemacs_mule))
    {
      if (EQ (args[coding_arg_charset_list], Qemacs_mule))
        ASET (attrs, coding_attr_emacs_mule_full, Qt);
      CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
      category = coding_category_emacs_mule;
    }
  else if (EQ (coding_type, Qshift_jis))
    {

      struct charset *charset;

      /* The charset list must hold two charsets of dimension one,
         then one of dimension two, optionally followed by another of
         dimension two.  CHARSET_LIST is consumed while checking; the
         attribute was already stored above.  */
      if (XINT (Flength (charset_list)) != 3
          && XINT (Flength (charset_list)) != 4)
        error ("There should be three or four charsets");

      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 1)
        error ("Dimension of charset %s is not one",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
      if (CHARSET_ASCII_COMPATIBLE_P (charset))
        CODING_ATTR_ASCII_COMPAT (attrs) = Qt;

      charset_list = XCDR (charset_list);
      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 1)
        error ("Dimension of charset %s is not one",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

      charset_list = XCDR (charset_list);
      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 2)
        error ("Dimension of charset %s is not two",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

      charset_list = XCDR (charset_list);
      if (! NILP (charset_list))
        {
          charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
          if (CHARSET_DIMENSION (charset) != 2)
            error ("Dimension of charset %s is not two",
                   SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
        }

      category = coding_category_sjis;
      /* Record NAME in Vsjis_coding_system.  */
      Vsjis_coding_system = name;
    }
  else if (EQ (coding_type, Qbig5))
    {
      struct charset *charset;

      /* The charset list must hold a charset of dimension one
         followed by a charset of dimension two.  */
      if (XINT (Flength (charset_list)) != 2)
        error ("There should be just two charsets");

      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 1)
        error ("Dimension of charset %s is not one",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));
      if (CHARSET_ASCII_COMPATIBLE_P (charset))
        CODING_ATTR_ASCII_COMPAT (attrs) = Qt;

      charset_list = XCDR (charset_list);
      charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
      if (CHARSET_DIMENSION (charset) != 2)
        error ("Dimension of charset %s is not two",
               SDATA (SYMBOL_NAME (CHARSET_NAME (charset))));

      category = coding_category_big5;
      /* Record NAME in Vbig5_coding_system, which the Big5 character
         primitives consult.  */
      Vbig5_coding_system = name;
    }
  else if (EQ (coding_type, Qraw_text))
    {
      category = coding_category_raw_text;
      CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
    }
  else if (EQ (coding_type, Qutf_8))
    {
      category = coding_category_utf_8;
      CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
    }
  else if (EQ (coding_type, Qundecided))
    category = coding_category_undecided;
  else
    error ("Invalid coding system type: %s",
           SDATA (SYMBOL_NAME (coding_type)));

  /* Record the category in the attributes, and also at the head of
     the property list under the key QCcategory.  */
  CODING_ATTR_CATEGORY (attrs) = make_number (category);
  CODING_ATTR_PLIST (attrs)
    = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
                                CODING_ATTR_PLIST (attrs)));

  /* The eol-type argument is nil, `unix', `dos', or `mac'.  */
  eol_type = args[coding_arg_eol_type];
  if (! NILP (eol_type)
      && ! EQ (eol_type, Qunix)
      && ! EQ (eol_type, Qdos)
      && ! EQ (eol_type, Qmac))
    error ("Invalid eol-type");

  aliases = Fcons (name, Qnil);

  if (NILP (eol_type))
    {
      /* No eol-type given: EOL_TYPE becomes the vector of the three
         subsidiary names, and each subsidiary is registered with the
         same ATTRS and its own eol-type.  */
      eol_type = make_subsidiaries (name);
      for (i = 0; i < 3; i++)
        {
          Lisp_Object this_spec, this_name, this_aliases, this_eol_type;

          this_name = AREF (eol_type, i);
          this_aliases = Fcons (this_name, Qnil);
          this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
          this_spec = Fmake_vector (make_number (3), attrs);
          ASET (this_spec, 1, this_aliases);
          ASET (this_spec, 2, this_eol_type);
          Fputhash (this_name, this_spec, Vcoding_system_hash_table);
          Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
          Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
                                        Vcoding_system_alist);
        }
    }

  /* Register NAME itself with the spec [ ATTRS ALIASES EOL_TYPE ].  */
  spec_vec = Fmake_vector (make_number (3), attrs);
  ASET (spec_vec, 1, aliases);
  ASET (spec_vec, 2, eol_type);

  Fputhash (name, spec_vec, Vcoding_system_hash_table);
  Vcoding_system_list = Fcons (name, Vcoding_system_list);
  Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
                                Vcoding_system_alist);

  {
    int id = coding_categories[category].id;

    /* Set up the coding system of the category when it has none yet
       (negative ID) or when it is already NAME (presumably NAME is
       being redefined).  */
    if (id < 0 || EQ (name, CODING_ID_NAME (id)))
      setup_coding_system (name, &coding_categories[category]);
  }

  return Qnil;

  /* Reached when ARGS is too short for the coding type.  */
 short_args:
  return Fsignal (Qwrong_number_of_arguments,
                  Fcons (intern ("define-coding-system-internal"),
                         make_number (nargs)));
}
8864
8865
8866 DEFUN ("coding-system-put", Fcoding_system_put, Scoding_system_put,
8867        3, 3, 0,
8868        doc: /* Change value in CODING-SYSTEM's property list PROP to VAL.  */)
8869   (coding_system, prop, val)
8870      Lisp_Object coding_system, prop, val;
8871 {
8872   Lisp_Object spec, attrs;
8873
8874   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8875   attrs = AREF (spec, 0);
8876   if (EQ (prop, QCmnemonic))
8877     {
8878       if (! STRINGP (val))
8879         CHECK_CHARACTER (val);
8880       CODING_ATTR_MNEMONIC (attrs) = val;
8881     }
8882   else if (EQ (prop, QCdefalut_char))
8883     {
8884       if (NILP (val))
8885         val = make_number (' ');
8886       else
8887         CHECK_CHARACTER (val);
8888       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
8889     }
8890   else if (EQ (prop, QCdecode_translation_table))
8891     {
8892       if (! CHAR_TABLE_P (val) && ! CONSP (val))
8893         CHECK_SYMBOL (val);
8894       CODING_ATTR_DECODE_TBL (attrs) = val;
8895     }
8896   else if (EQ (prop, QCencode_translation_table))
8897     {
8898       if (! CHAR_TABLE_P (val) && ! CONSP (val))
8899         CHECK_SYMBOL (val);
8900       CODING_ATTR_ENCODE_TBL (attrs) = val;
8901     }
8902   else if (EQ (prop, QCpost_read_conversion))
8903     {
8904       CHECK_SYMBOL (val);
8905       CODING_ATTR_POST_READ (attrs) = val;
8906     }
8907   else if (EQ (prop, QCpre_write_conversion))
8908     {
8909       CHECK_SYMBOL (val);
8910       CODING_ATTR_PRE_WRITE (attrs) = val;
8911     }
8912
8913   CODING_ATTR_PLIST (attrs)
8914     = Fplist_put (CODING_ATTR_PLIST (attrs), prop, val);
8915   return val;
8916 }
8917
8918
8919 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8920        Sdefine_coding_system_alias, 2, 2, 0,
8921        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
8922      (alias, coding_system)
8923      Lisp_Object alias, coding_system;
8924 {
8925   Lisp_Object spec, aliases, eol_type;
8926
8927   CHECK_SYMBOL (alias);
8928   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8929   aliases = AREF (spec, 1);
8930   /* ALISES should be a list of length more than zero, and the first
8931      element is a base coding system.  Append ALIAS at the tail of the
8932      list.  */
8933   while (!NILP (XCDR (aliases)))
8934     aliases = XCDR (aliases);
8935   XSETCDR (aliases, Fcons (alias, Qnil));
8936
8937   eol_type = AREF (spec, 2);
8938   if (VECTORP (eol_type))
8939     {
8940       Lisp_Object subsidiaries;
8941       int i;
8942
8943       subsidiaries = make_subsidiaries (alias);
8944       for (i = 0; i < 3; i++)
8945         Fdefine_coding_system_alias (AREF (subsidiaries, i),
8946                                      AREF (eol_type, i));
8947     }
8948
8949   Fputhash (alias, spec, Vcoding_system_hash_table);
8950   Vcoding_system_list = Fcons (alias, Vcoding_system_list);
8951   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8952                                 Vcoding_system_alist);
8953
8954   return Qnil;
8955 }
8956
8957 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8958        1, 1, 0,
8959        doc: /* Return the base of CODING-SYSTEM.
8960 Any alias or subsidiary coding system is not a base coding system.  */)
8961   (coding_system)
8962      Lisp_Object coding_system;
8963 {
8964   Lisp_Object spec, attrs;
8965
8966   if (NILP (coding_system))
8967     return (Qno_conversion);
8968   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8969   attrs = AREF (spec, 0);
8970   return CODING_ATTR_BASE_NAME (attrs);
8971 }
8972
8973 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8974        1, 1, 0,
8975        doc: "Return the property list of CODING-SYSTEM.")
8976      (coding_system)
8977      Lisp_Object coding_system;
8978 {
8979   Lisp_Object spec, attrs;
8980
8981   if (NILP (coding_system))
8982     coding_system = Qno_conversion;
8983   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8984   attrs = AREF (spec, 0);
8985   return CODING_ATTR_PLIST (attrs);
8986 }
8987
8988
8989 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8990        1, 1, 0,
8991        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
8992      (coding_system)
8993      Lisp_Object coding_system;
8994 {
8995   Lisp_Object spec;
8996
8997   if (NILP (coding_system))
8998     coding_system = Qno_conversion;
8999   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
9000   return AREF (spec, 1);
9001 }
9002
9003 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
9004        Scoding_system_eol_type, 1, 1, 0,
9005        doc: /* Return eol-type of CODING-SYSTEM.
9006 An eol-type is integer 0, 1, 2, or a vector of coding systems.
9007
9008 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
9009 and CR respectively.
9010
9011 A vector value indicates that a format of end-of-line should be
9012 detected automatically.  Nth element of the vector is the subsidiary
9013 coding system whose eol-type is N.  */)
9014      (coding_system)
9015      Lisp_Object coding_system;
9016 {
9017   Lisp_Object spec, eol_type;
9018   int n;
9019
9020   if (NILP (coding_system))
9021     coding_system = Qno_conversion;
9022   if (! CODING_SYSTEM_P (coding_system))
9023     return Qnil;
9024   spec = CODING_SYSTEM_SPEC (coding_system);
9025   eol_type = AREF (spec, 2);
9026   if (VECTORP (eol_type))
9027     return Fcopy_sequence (eol_type);
9028   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
9029   return make_number (n);
9030 }
9031
9032 #endif /* emacs */
9033
9034 \f
9035 /*** 12. Postamble ***/
9036
9037 void
9038 init_coding_once ()
9039 {
9040   int i;
9041
9042   for (i = 0; i < coding_category_max; i++)
9043     {
9044       coding_categories[i].id = -1;
9045       coding_priorities[i] = i;
9046     }
9047
9048   /* ISO2022 specific initialize routine.  */
9049   for (i = 0; i < 0x20; i++)
9050     iso_code_class[i] = ISO_control_0;
9051   for (i = 0x21; i < 0x7F; i++)
9052     iso_code_class[i] = ISO_graphic_plane_0;
9053   for (i = 0x80; i < 0xA0; i++)
9054     iso_code_class[i] = ISO_control_1;
9055   for (i = 0xA1; i < 0xFF; i++)
9056     iso_code_class[i] = ISO_graphic_plane_1;
9057   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
9058   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
9059   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
9060   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
9061   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
9062   iso_code_class[ISO_CODE_ESC] = ISO_escape;
9063   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
9064   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
9065   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
9066
9067   for (i = 0; i < 256; i++)
9068     {
9069       emacs_mule_bytes[i] = 1;
9070     }
9071   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
9072   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
9073   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
9074   emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
9075 }
9076
9077 #ifdef emacs
9078
/* Define the Lisp symbols, primitives and variables of this file.
   Afterwards define the `no-conversion' coding system, set up the
   keyboard, terminal and safe-terminal coding systems with it, and
   set every coding-category symbol to it.  */
9079 void
9080 syms_of_coding ()
9081 {
  /* The hash table that Fputhash stores coding system specs in; it
     is created with an `eq' test.  */
9082   staticpro (&Vcoding_system_hash_table);
9083   {
9084     Lisp_Object args[2];
9085     args[0] = QCtest;
9086     args[1] = Qeq;
9087     Vcoding_system_hash_table = Fmake_hash_table (2, args);
9088   }
9089
9090   staticpro (&Vsjis_coding_system);
9091   Vsjis_coding_system = Qnil;
9092
9093   staticpro (&Vbig5_coding_system);
9094   Vbig5_coding_system = Qnil;
9095
9096   staticpro (&Vcode_conversion_reused_workbuf);
9097   Vcode_conversion_reused_workbuf = Qnil;
9098
9099   staticpro (&Vcode_conversion_workbuf_name);
9100   Vcode_conversion_workbuf_name = build_string (" *code-conversion-work*");
9101
9102   reused_workbuf_in_use = 0;
9103
9104   DEFSYM (Qcharset, "charset");
9105   DEFSYM (Qtarget_idx, "target-idx");
9106   DEFSYM (Qcoding_system_history, "coding-system-history");
9107   Fset (Qcoding_system_history, Qnil);
9108
  /* Each operation symbol below gets a `target-idx' property: the
     zero-based index of its target argument.  */
9109   /* Target FILENAME is the first argument.  */
9110   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
9111   /* Target FILENAME is the third argument.  */
9112   Fput (Qwrite_region, Qtarget_idx, make_number (2));
9113
9114   DEFSYM (Qcall_process, "call-process");
9115   /* Target PROGRAM is the first argument.  */
9116   Fput (Qcall_process, Qtarget_idx, make_number (0));
9117
9118   DEFSYM (Qcall_process_region, "call-process-region");
9119   /* Target PROGRAM is the third argument.  */
9120   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
9121
9122   DEFSYM (Qstart_process, "start-process");
9123   /* Target PROGRAM is the third argument.  */
9124   Fput (Qstart_process, Qtarget_idx, make_number (2));
9125
9126   DEFSYM (Qopen_network_stream, "open-network-stream");
9127   /* Target SERVICE is the fourth argument.  */
9128   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
9129
9130   DEFSYM (Qcoding_system, "coding-system");
9131   DEFSYM (Qcoding_aliases, "coding-aliases");
9132
9133   DEFSYM (Qeol_type, "eol-type");
9134   DEFSYM (Qunix, "unix");
9135   DEFSYM (Qdos, "dos");
9136
9137   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
9138   DEFSYM (Qpost_read_conversion, "post-read-conversion");
9139   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
9140   DEFSYM (Qdefault_char, "default-char");
9141   DEFSYM (Qundecided, "undecided");
9142   DEFSYM (Qno_conversion, "no-conversion");
9143   DEFSYM (Qraw_text, "raw-text");
9144
9145   DEFSYM (Qiso_2022, "iso-2022");
9146
9147   DEFSYM (Qutf_8, "utf-8");
9148   DEFSYM (Qutf_8_emacs, "utf-8-emacs");
9149
9150   DEFSYM (Qutf_16, "utf-16");
9151   DEFSYM (Qbig, "big");
9152   DEFSYM (Qlittle, "little");
9153
9154   DEFSYM (Qshift_jis, "shift-jis");
9155   DEFSYM (Qbig5, "big5");
9156
9157   DEFSYM (Qcoding_system_p, "coding-system-p");
9158
  /* `coding-system-error' is set up as an error condition whose
     parent is `error'.  */
9159   DEFSYM (Qcoding_system_error, "coding-system-error");
9160   Fput (Qcoding_system_error, Qerror_conditions,
9161         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
9162   Fput (Qcoding_system_error, Qerror_message,
9163         build_string ("Invalid coding system"));
9164
9165   /* Intern this now in case it isn't already done.
9166      Setting this variable twice is harmless.
9167      But don't staticpro it here--that is done in alloc.c.  */
9168   Qchar_table_extra_slots = intern ("char-table-extra-slots");
9169
9170   DEFSYM (Qtranslation_table, "translation-table");
9171   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2));
9172   DEFSYM (Qtranslation_table_id, "translation-table-id");
9173   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
9174   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
9175
9176   DEFSYM (Qvalid_codes, "valid-codes");
9177
9178   DEFSYM (Qemacs_mule, "emacs-mule");
9179
  /* Keyword symbols; the ones from `:mnemonic' on are the properties
     that `coding-system-put' type-checks.  */
9180   DEFSYM (QCcategory, ":category");
9181   DEFSYM (QCmnemonic, ":mnemonic");
9182   DEFSYM (QCdefalut_char, ":default-char");
9183   DEFSYM (QCdecode_translation_table, ":decode-translation-table");
9184   DEFSYM (QCencode_translation_table, ":encode-translation-table");
9185   DEFSYM (QCpost_read_conversion, ":post-read-conversion");
9186   DEFSYM (QCpre_write_conversion, ":pre-write-conversion");
9187
  /* Vcoding_category_table maps each coding category index to its
     `coding-category-*' symbol.  */
9188   Vcoding_category_table
9189     = Fmake_vector (make_number (coding_category_max), Qnil);
9190   staticpro (&Vcoding_category_table);
9191   /* The following are targets of code detection.  */
9192   ASET (Vcoding_category_table, coding_category_iso_7,
9193         intern ("coding-category-iso-7"));
9194   ASET (Vcoding_category_table, coding_category_iso_7_tight,
9195         intern ("coding-category-iso-7-tight"));
9196   ASET (Vcoding_category_table, coding_category_iso_8_1,
9197         intern ("coding-category-iso-8-1"));
9198   ASET (Vcoding_category_table, coding_category_iso_8_2,
9199         intern ("coding-category-iso-8-2"));
9200   ASET (Vcoding_category_table, coding_category_iso_7_else,
9201         intern ("coding-category-iso-7-else"));
9202   ASET (Vcoding_category_table, coding_category_iso_8_else,
9203         intern ("coding-category-iso-8-else"));
9204   ASET (Vcoding_category_table, coding_category_utf_8,
9205         intern ("coding-category-utf-8"));
9206   ASET (Vcoding_category_table, coding_category_utf_16_be,
9207         intern ("coding-category-utf-16-be"));
9208   ASET (Vcoding_category_table, coding_category_utf_16_auto,
9209         intern ("coding-category-utf-16-auto"));
9210   ASET (Vcoding_category_table, coding_category_utf_16_le,
9211         intern ("coding-category-utf-16-le"));
9212   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
9213         intern ("coding-category-utf-16-be-nosig"));
9214   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
9215         intern ("coding-category-utf-16-le-nosig"));
9216   ASET (Vcoding_category_table, coding_category_charset,
9217         intern ("coding-category-charset"));
9218   ASET (Vcoding_category_table, coding_category_sjis,
9219         intern ("coding-category-sjis"));
9220   ASET (Vcoding_category_table, coding_category_big5,
9221         intern ("coding-category-big5"));
9222   ASET (Vcoding_category_table, coding_category_ccl,
9223         intern ("coding-category-ccl"));
9224   ASET (Vcoding_category_table, coding_category_emacs_mule,
9225         intern ("coding-category-emacs-mule"));
9226   /* The following are NOT targets of code detection.  */
9227   ASET (Vcoding_category_table, coding_category_raw_text,
9228         intern ("coding-category-raw-text"));
9229   ASET (Vcoding_category_table, coding_category_undecided,
9230         intern ("coding-category-undecided"));
9231
  /* Symbols listed in the doc of `last-code-conversion-error' as its
     possible values.  */
9232   DEFSYM (Qinsufficient_source, "insufficient-source");
9233   DEFSYM (Qinconsistent_eol, "inconsistent-eol");
9234   DEFSYM (Qinvalid_source, "invalid-source");
9235   DEFSYM (Qinterrupted, "interrupted");
9236   DEFSYM (Qinsufficient_memory, "insufficient-memory");
9237
  /* Register the Lisp primitives.  */
9238   defsubr (&Scoding_system_p);
9239   defsubr (&Sread_coding_system);
9240   defsubr (&Sread_non_nil_coding_system);
9241   defsubr (&Scheck_coding_system);
9242   defsubr (&Sdetect_coding_region);
9243   defsubr (&Sdetect_coding_string);
9244   defsubr (&Sfind_coding_systems_region_internal);
9245   defsubr (&Sunencodable_char_position);
9246   defsubr (&Scheck_coding_systems_region);
9247   defsubr (&Sdecode_coding_region);
9248   defsubr (&Sencode_coding_region);
9249   defsubr (&Sdecode_coding_string);
9250   defsubr (&Sencode_coding_string);
9251   defsubr (&Sdecode_sjis_char);
9252   defsubr (&Sencode_sjis_char);
9253   defsubr (&Sdecode_big5_char);
9254   defsubr (&Sencode_big5_char);
9255   defsubr (&Sset_terminal_coding_system_internal);
9256   defsubr (&Sset_safe_terminal_coding_system_internal);
9257   defsubr (&Sterminal_coding_system);
9258   defsubr (&Sset_keyboard_coding_system_internal);
9259   defsubr (&Skeyboard_coding_system);
9260   defsubr (&Sfind_operation_coding_system);
9261   defsubr (&Sset_coding_system_priority);
9262   defsubr (&Sdefine_coding_system_internal);
9263   defsubr (&Sdefine_coding_system_alias);
9264   defsubr (&Scoding_system_put);
9265   defsubr (&Scoding_system_base);
9266   defsubr (&Scoding_system_plist);
9267   defsubr (&Scoding_system_aliases);
9268   defsubr (&Scoding_system_eol_type);
9269   defsubr (&Scoding_system_priority_list);
9270
9271   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
9272                doc: /* List of coding systems.
9273
9274 Do not alter the value of this variable manually.  This variable should be
9275 updated by the functions `define-coding-system' and
9276 `define-coding-system-alias'.  */);
9277   Vcoding_system_list = Qnil;
9278
9279   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
9280                doc: /* Alist of coding system names.
9281 Each element is one element list of coding system name.
9282 This variable is given to `completing-read' as TABLE argument.
9283
9284 Do not alter the value of this variable manually.  This variable should be
9285 updated by the functions `make-coding-system' and
9286 `define-coding-system-alias'.  */);
9287   Vcoding_system_alist = Qnil;
9288
9289   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
9290                doc: /* List of coding-categories (symbols) ordered by priority.
9291
9292 On detecting a coding system, Emacs tries code detection algorithms
9293 associated with each coding-category one by one in this order.  When
9294 one algorithm agrees with a byte sequence of source text, the coding
9295 system bound to the corresponding coding-category is selected.  */);
9296   {
9297     int i;
9298
    /* Consing from the last index down to 0 leaves the list in
       ascending category-index order.  */
9299     Vcoding_category_list = Qnil;
9300     for (i = coding_category_max - 1; i >= 0; i--)
9301       Vcoding_category_list
9302         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
9303                  Vcoding_category_list);
9304   }
9305
9306   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
9307                doc: /* Specify the coding system for read operations.
9308 It is useful to bind this variable with `let', but do not set it globally.
9309 If the value is a coding system, it is used for decoding on read operation.
9310 If not, an appropriate element is used from one of the coding system alists:
9311 There are three such tables, `file-coding-system-alist',
9312 `process-coding-system-alist', and `network-coding-system-alist'.  */);
9313   Vcoding_system_for_read = Qnil;
9314
9315   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
9316                doc: /* Specify the coding system for write operations.
9317 Programs bind this variable with `let', but you should not set it globally.
9318 If the value is a coding system, it is used for encoding of output,
9319 when writing it to a file and when sending it to a file or subprocess.
9320
9321 If this does not specify a coding system, an appropriate element
9322 is used from one of the coding system alists:
9323 There are three such tables, `file-coding-system-alist',
9324 `process-coding-system-alist', and `network-coding-system-alist'.
9325 For output to files, if the above procedure does not specify a coding system,
9326 the value of `buffer-file-coding-system' is used.  */);
9327   Vcoding_system_for_write = Qnil;
9328
9329   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
9330                doc: /*
9331 Coding system used in the latest file or process I/O.  */);
9332   Vlast_coding_system_used = Qnil;
9333
9334   DEFVAR_LISP ("last-code-conversion-error", &Vlast_code_conversion_error,
9335                doc: /*
9336 Error status of the last code conversion.
9337
9338 When an error was detected in the last code conversion, this variable
9339 is set to one of the following symbols.
9340   `insufficient-source'
9341   `inconsistent-eol'
9342   `invalid-source'
9343   `interrupted'
9344   `insufficient-memory'
9345 When no error was detected, the value doesn't change.  So, to check
9346 the error status of a code conversion by this variable, you must
9347 explicitly set this variable to nil before performing code
9348 conversion.  */);
9349   Vlast_code_conversion_error = Qnil;
9350
9351   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
9352                doc: /*
9353 *Non-nil means always inhibit code conversion of end-of-line format.
9354 See info node `Coding Systems' and info node `Text and Binary' concerning
9355 such conversion.  */);
9356   inhibit_eol_conversion = 0;
9357
9358   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
9359                doc: /*
9360 Non-nil means process buffer inherits coding system of process output.
9361 Bind it to t if the process output is to be treated as if it were a file
9362 read from some filesystem.  */);
9363   inherit_process_coding_system = 0;
9364
9365   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
9366                doc: /*
9367 Alist to decide a coding system to use for a file I/O operation.
9368 The format is ((PATTERN . VAL) ...),
9369 where PATTERN is a regular expression matching a file name,
9370 VAL is a coding system, a cons of coding systems, or a function symbol.
9371 If VAL is a coding system, it is used for both decoding and encoding
9372 the file contents.
9373 If VAL is a cons of coding systems, the car part is used for decoding,
9374 and the cdr part is used for encoding.
9375 If VAL is a function symbol, the function must return a coding system
9376 or a cons of coding systems which are used as above.  The function gets
9377 the arguments with which `find-operation-coding-system' was called.
9378
9379 See also the function `find-operation-coding-system'
9380 and the variable `auto-coding-alist'.  */);
9381   Vfile_coding_system_alist = Qnil;
9382
9383   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
9384                doc: /*
9385 Alist to decide a coding system to use for a process I/O operation.
9386 The format is ((PATTERN . VAL) ...),
9387 where PATTERN is a regular expression matching a program name,
9388 VAL is a coding system, a cons of coding systems, or a function symbol.
9389 If VAL is a coding system, it is used for both decoding what received
9390 from the program and encoding what sent to the program.
9391 If VAL is a cons of coding systems, the car part is used for decoding,
9392 and the cdr part is used for encoding.
9393 If VAL is a function symbol, the function must return a coding system
9394 or a cons of coding systems which are used as above.
9395
9396 See also the function `find-operation-coding-system'.  */);
9397   Vprocess_coding_system_alist = Qnil;
9398
9399   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
9400                doc: /*
9401 Alist to decide a coding system to use for a network I/O operation.
9402 The format is ((PATTERN . VAL) ...),
9403 where PATTERN is a regular expression matching a network service name
9404 or is a port number to connect to,
9405 VAL is a coding system, a cons of coding systems, or a function symbol.
9406 If VAL is a coding system, it is used for both decoding what received
9407 from the network stream and encoding what sent to the network stream.
9408 If VAL is a cons of coding systems, the car part is used for decoding,
9409 and the cdr part is used for encoding.
9410 If VAL is a function symbol, the function must return a coding system
9411 or a cons of coding systems which are used as above.
9412
9413 See also the function `find-operation-coding-system'.  */);
9414   Vnetwork_coding_system_alist = Qnil;
9415
9416   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
9417                doc: /* Coding system to use with system messages.
9418 Also used for decoding keyboard input on X Window system.  */);
9419   Vlocale_coding_system = Qnil;
9420
9421   /* The eol mnemonics are reset in startup.el system-dependently.  */
9422   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
9423                doc: /*
9424 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
9425   eol_mnemonic_unix = build_string (":");
9426
9427   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
9428                doc: /*
9429 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
9430   eol_mnemonic_dos = build_string ("\\");
9431
9432   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
9433                doc: /*
9434 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
9435   eol_mnemonic_mac = build_string ("/");
9436
9437   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
9438                doc: /*
9439 *String displayed in mode line when end-of-line format is not yet determined.  */);
9440   eol_mnemonic_undecided = build_string (":");
9441
9442   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
9443                doc: /*
9444 *Non-nil enables character translation while encoding and decoding.  */);
9445   Venable_character_translation = Qt;
9446
9447   DEFVAR_LISP ("standard-translation-table-for-decode",
9448                &Vstandard_translation_table_for_decode,
9449                doc: /* Table for translating characters while decoding.  */);
9450   Vstandard_translation_table_for_decode = Qnil;
9451
9452   DEFVAR_LISP ("standard-translation-table-for-encode",
9453                &Vstandard_translation_table_for_encode,
9454                doc: /* Table for translating characters while encoding.  */);
9455   Vstandard_translation_table_for_encode = Qnil;
9456
9457   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
9458                doc: /* Alist of charsets vs revision numbers.
9459 While encoding, if a charset (car part of an element) is found,
9460 designate it with the escape sequence identifying revision (cdr part
9461 of the element).  */);
9462   Vcharset_revision_table = Qnil;
9463
9464   DEFVAR_LISP ("default-process-coding-system",
9465                &Vdefault_process_coding_system,
9466                doc: /* Cons of coding systems used for process I/O by default.
9467 The car part is used for decoding a process output,
9468 the cdr part is used for encoding a text to be sent to a process.  */);
9469   Vdefault_process_coding_system = Qnil;
9470
9471   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
9472                doc: /*
9473 Table of extra Latin codes in the range 128..159 (inclusive).
9474 This is a vector of length 256.
9475 If Nth element is non-nil, the existence of code N in a file
9476 \(or output of subprocess) doesn't prevent it to be detected as
9477 a coding system of ISO 2022 variant which has a flag
9478 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
9479 or reading output of a subprocess.
9480 Only the 128th through 159th elements have a meaning.  */);
9481   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
9482
9483   DEFVAR_LISP ("select-safe-coding-system-function",
9484                &Vselect_safe_coding_system_function,
9485                doc: /*
9486 Function to call to select safe coding system for encoding a text.
9487
9488 If set, this function is called to force a user to select a proper
9489 coding system which can encode the text in the case that a default
9490 coding system used in each operation can't encode the text.
9491
9492 The default value is `select-safe-coding-system' (which see).  */);
9493   Vselect_safe_coding_system_function = Qnil;
9494
9495   DEFVAR_BOOL ("coding-system-require-warning",
9496                &coding_system_require_warning,
9497                doc: /* Internal use only.
9498 If non-nil, on writing a file, `select-safe-coding-system-function' is
9499 called even if `coding-system-for-write' is non-nil.  The command
9500 `universal-coding-system-argument' binds this variable to t temporarily.  */);
9501   coding_system_require_warning = 0;
9502
9503
9504   DEFVAR_BOOL ("inhibit-iso-escape-detection",
9505                &inhibit_iso_escape_detection,
9506                doc: /*
9507 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
9508
9509 By default, on reading a file, Emacs tries to detect how the text is
9510 encoded.  This code detection is sensitive to escape sequences.  If
9511 the sequence is valid as ISO2022, the code is determined as one of
9512 the ISO2022 encodings, and the file is decoded by the corresponding
9513 coding system (e.g. `iso-2022-7bit').
9514
9515 However, there may be a case that you want to read escape sequences in
9516 a file as is.  In such a case, you can set this variable to non-nil.
9517 Then, as the code detection ignores any escape sequences, no file is
9518 detected as encoded in some ISO2022 encoding.  The result is that all
9519 escape sequences become visible in a buffer.
9520
9521 The default value is nil, and it is strongly recommended not to change
9522 it.  That is because many Emacs Lisp source files that contain
9523 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
9524 in Emacs's distribution, and they won't be decoded correctly on
9525 reading if you suppress escape sequence detection.
9526
9527 The other way to read escape sequences in a file without decoding is
9528 to explicitly specify some coding system that doesn't use ISO2022's
9529 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
9530   inhibit_iso_escape_detection = 0;
9531
9532   DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input,
9533                doc: /* Char table for translating self-inserting characters.
9534 This is applied to the result of input methods, not their input.  See also
9535 `keyboard-translate-table'.  */);
9536     Vtranslation_table_for_input = Qnil;
9537
  /* Define the `no-conversion' coding system.  ARGS carries the
     positional arguments for Fdefine_coding_system_internal and PLIST
     the key/value pairs that become its property list.  */
9538   {
9539     Lisp_Object args[coding_arg_max];
9540     Lisp_Object plist[16];
9541     int i;
9542
9543     for (i = 0; i < coding_arg_max; i++)
9544       args[i] = Qnil;
9545
9546     plist[0] = intern (":name");
9547     plist[1] = args[coding_arg_name] = Qno_conversion;
9548     plist[2] = intern (":mnemonic");
9549     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
9550     plist[4] = intern (":coding-type");
9551     plist[5] = args[coding_arg_coding_type] = Qraw_text;
9552     plist[6] = intern (":ascii-compatible-p");
9553     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
9554     plist[8] = intern (":default-char");
9555     plist[9] = args[coding_arg_default_char] = make_number (0);
9556     plist[10] = intern (":for-unibyte");
9557     plist[11] = args[coding_arg_for_unibyte] = Qt;
9558     plist[12] = intern (":docstring");
9559     plist[13] = build_string ("Do no conversion.\n\
9560 \n\
9561 When you visit a file with this coding, the file is read into a\n\
9562 unibyte buffer as is, thus each byte of a file is treated as a\n\
9563 character.");
9564     plist[14] = intern (":eol-type");
9565     plist[15] = args[coding_arg_eol_type] = Qunix;
9566     args[coding_arg_plist] = Flist (16, plist);
9567     Fdefine_coding_system_internal (coding_arg_max, args);
9568   }
9569
  /* Set up the keyboard, terminal and safe-terminal coding systems
     with `no-conversion'.  */
9570   setup_coding_system (Qno_conversion, &keyboard_coding);
9571   setup_coding_system (Qno_conversion, &terminal_coding);
9572   setup_coding_system (Qno_conversion, &safe_terminal_coding);
9573
  /* Set the value of every coding-category symbol to
     `no-conversion'.  */
9574   {
9575     int i;
9576
9577     for (i = 0; i < coding_category_max; i++)
9578       Fset (AREF (Vcoding_category_table, i), Qno_conversion);
9579   }
9580 }
9581
9582 char *
9583 emacs_strerror (error_number)
9584      int error_number;
9585 {
9586   char *str;
9587
9588   synchronize_system_messages_locale ();
9589   str = strerror (error_number);
9590
9591   if (! NILP (Vlocale_coding_system))
9592     {
9593       Lisp_Object dec = code_convert_string_norecord (build_string (str),
9594                                                       Vlocale_coding_system,
9595                                                       0);
9596       str = (char *) SDATA (dec);
9597     }
9598
9599   return str;
9600 }
9601
9602 #endif /* emacs */
9603
9604 /* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d
9605    (do not change this comment) */