src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001 Free Software Foundation, Inc.
   5    Copyright (C) 2001, 2002
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for a text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX.  Return 1 if the data contains
 148   a byte sequence which can be decoded into non-ASCII characters by
 149   the coding system.  Otherwize (i.e. the data contains only ASCII
 150   characters or invalid sequence) return 0.
 151
 152   It also resets some bits of an integer pointed by MASK.  The macros
 153   CATEGORY_MASK_XXX specifies each bit of this integer.
 154
 155   Below is the template of these functions.  */
 156
 157 #if 0
 158 static int
 159 detect_coding_XXX (coding, mask)
 160      struct coding_system *coding;
 161      int *mask;
 162 {
 163   unsigned char *src = coding->source;
 164   unsigned char *src_end = coding->source + coding->src_bytes;
 165   int multibytep = coding->src_multibyte;
 166   int c;
 167   int found = 0;
 168   ...;
 169
 170   while (1)
 171     {
 172       /* Get one byte from the source.  If the souce is exausted, jump
 173          to no_more_source:.  */
 174       ONE_MORE_BYTE (c);
 175       /* Check if it conforms to XXX.  If not, break the loop.  */
 176     }
 177   /* As the data is invalid for XXX, reset a proper bits.  */
 178   *mask &= ~CODING_CATEGORY_XXX;
 179   return 0;
 180  no_more_source:
 181   /* The source exausted.  */
 182   if (!found)
 183     /* ASCII characters only. */
 184     return 0;
 185   /* Some data should be decoded into non-ASCII characters.  */
 186   *mask &= CODING_CATEGORY_XXX;
 187   return 1;
 188 }
 189 #endif
 190
 191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 192
 193   These functions decode a byte sequence specified as a source by
 194   CODING.  The resulting multibyte text goes to a place pointed to by
 195   CODING->charbuf, the length of which should not exceed
 196   CODING->charbuf_size;
 197
 198   These functions set the information of original and decoded texts in
 199   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 200   They also set CODING->result to one of CODING_RESULT_XXX indicating
 201   how the decoding is finished.
 202
 203   Below is the template of these functions.  */
 204
 205 #if 0
 206 static void
 207 decode_coding_XXXX (coding)
 208      struct coding_system *coding;
 209 {
 210   unsigned char *src = coding->source + coding->consumed;
 211   unsigned char *src_end = coding->source + coding->src_bytes;
 212   /* SRC_BASE remembers the start position in source in each loop.
 213      The loop will be exited when there's not enough source code, or
 214      when there's no room in CHARBUF for a decoded character.  */
 215   unsigned char *src_base;
 216   /* A buffer to produce decoded characters.  */
 217   int *charbuf = coding->charbuf;
 218   int *charbuf_end = charbuf + coding->charbuf_size;
 219   int multibytep = coding->src_multibyte;
 220
 221   while (1)
 222     {
 223       src_base = src;
 224       if (charbuf < charbuf_end)
 225         /* No more room to produce a decoded character.  */
 226         break;
 227       ONE_MORE_BYTE (c);
 228       /* Decode it. */
 229     }
 230
 231  no_more_source:
 232   if (src_base < src_end
 233       && coding->mode & CODING_MODE_LAST_BLOCK)
 234     /* If the source ends by partial bytes to construct a character,
 235        treat them as eight-bit raw data.  */
 236     while (src_base < src_end && charbuf < charbuf_end)
 237       *charbuf++ = *src_base++;
 238   /* Remember how many bytes and characters we consumed.  If the
 239      source is multibyte, the bytes and chars are not identical.  */
 240   coding->consumed = coding->consumed_char = src_base - coding->source;
 241   /* Remember how many characters we produced.  */
 242   coding->charbuf_used = charbuf - coding->charbuf;
 243 }
 244 #endif
 245
 246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 247
 248   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 249   internal multibyte format by CODING.  The resulting byte sequence
 250   goes to a place pointed to by DESTINATION, the length of which
 251   should not exceed DST_BYTES.
 252
 253   These functions set the information of original and encoded texts in
 254   the members produced, produced_char, consumed, and consumed_char of
 255   the structure *CODING.  They also set the member result to one of
 256   CODING_RESULT_XXX indicating how the encoding finished.
 257
 258   DST_BYTES zero means that source area and destination area are
 259   overlapped, which means that we can produce a encoded text until it
 260   reaches at the head of not-yet-encoded source text.
 261
 262   Below is a template of these functions.  */
 263 #if 0
 264 static void
 265 encode_coding_XXX (coding)
 266      struct coding_system *coding;
 267 {
 268   int multibytep = coding->dst_multibyte;
 269   int *charbuf = coding->charbuf;
 270   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 271   unsigned char *dst = coding->destination + coding->produced;
 272   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 273   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 274   int produced_chars = 0;
 275
 276   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 277     {
 278       int c = *charbuf;
 279       /* Encode C into DST, and increment DST.  */
 280     }
 281  label_no_more_destination:
 282   /* How many chars and bytes we produced.  */
 283   coding->produced_char += produced_chars;
 284   coding->produced = dst - coding->destination;
 285 }
 286 #endif
 287
 288 \f
 289 /*** 1. Preamble ***/
 290
 291 #include <config.h>
 292 #include <stdio.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302
 303 Lisp_Object Vcoding_system_hash_table;
 304
 305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 306 Lisp_Object Qunix, Qdos;
 307 extern Lisp_Object Qmac;        /* frame.c */
 308 Lisp_Object Qbuffer_file_coding_system;
 309 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 310 Lisp_Object Qdefault_char;
 311 Lisp_Object Qno_conversion, Qundecided;
 312 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 313 Lisp_Object Qutf_16_be_nosig, Qutf_16_be, Qutf_16_le_nosig, Qutf_16_le;
 314 Lisp_Object Qsignature, Qendian, Qbig, Qlittle;
 315 Lisp_Object Qcoding_system_history;
 316 Lisp_Object Qvalid_codes;
 317
 318 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 319 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 320 Lisp_Object Qstart_process, Qopen_network_stream;
 321 Lisp_Object Qtarget_idx;
 322
 323 Lisp_Object Vselect_safe_coding_system_function;
 324
 325 /* Mnemonic string for each format of end-of-line.  */
 326 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 327 /* Mnemonic string to indicate format of end-of-line is not yet
 328    decided.  */
 329 Lisp_Object eol_mnemonic_undecided;
 330
 331 #ifdef emacs
 332
 333 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 334
 335 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 336
 337 /* Coding system emacs-mule and raw-text are for converting only
 338    end-of-line format.  */
 339 Lisp_Object Qemacs_mule, Qraw_text;
 340
 341 /* Coding-systems are handed between Emacs Lisp programs and C internal
 342    routines by the following three variables.  */
 343 /* Coding-system for reading files and receiving data from process.  */
 344 Lisp_Object Vcoding_system_for_read;
 345 /* Coding-system for writing files and sending data to process.  */
 346 Lisp_Object Vcoding_system_for_write;
 347 /* Coding-system actually used in the latest I/O.  */
 348 Lisp_Object Vlast_coding_system_used;
 349
 350 /* A vector of length 256 which contains information about special
 351    Latin codes (especially for dealing with Microsoft codes).  */
 352 Lisp_Object Vlatin_extra_code_table;
 353
 354 /* Flag to inhibit code conversion of end-of-line format.  */
 355 int inhibit_eol_conversion;
 356
 357 /* Flag to inhibit ISO2022 escape sequence detection.  */
 358 int inhibit_iso_escape_detection;
 359
 360 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 361 int inherit_process_coding_system;
 362
 363 /* Coding system to be used to encode text for terminal display.  */
 364 struct coding_system terminal_coding;
 365
 366 /* Coding system to be used to encode text for terminal display when
 367    terminal coding system is nil.  */
 368 struct coding_system safe_terminal_coding;
 369
 370 /* Coding system of what is sent from terminal keyboard.  */
 371 struct coding_system keyboard_coding;
 372
 373 Lisp_Object Vfile_coding_system_alist;
 374 Lisp_Object Vprocess_coding_system_alist;
 375 Lisp_Object Vnetwork_coding_system_alist;
 376
 377 Lisp_Object Vlocale_coding_system;
 378
 379 #endif /* emacs */
 380
 381 /* Flag to tell if we look up translation table on character code
 382    conversion.  */
 383 Lisp_Object Venable_character_translation;
 384 /* Standard translation table to look up on decoding (reading).  */
 385 Lisp_Object Vstandard_translation_table_for_decode;
 386 /* Standard translation table to look up on encoding (writing).  */
 387 Lisp_Object Vstandard_translation_table_for_encode;
 388
 389 Lisp_Object Qtranslation_table;
 390 Lisp_Object Qtranslation_table_id;
 391 Lisp_Object Qtranslation_table_for_decode;
 392 Lisp_Object Qtranslation_table_for_encode;
 393
 394 /* Alist of charsets vs revision number.  */
 395 static Lisp_Object Vcharset_revision_table;
 396
 397 /* Default coding systems used for process I/O.  */
 398 Lisp_Object Vdefault_process_coding_system;
 399
 400 /* Global flag to tell that we can't call post-read-conversion and
 401    pre-write-conversion functions.  Usually the value is zero, but it
 402    is set to 1 temporarily while such functions are running.  This is
 403    to avoid infinite recursive call.  */
 404 static int inhibit_pre_post_conversion;
 405
 406 /* Two special coding systems.  */
 407 Lisp_Object Vsjis_coding_system;
 408 Lisp_Object Vbig5_coding_system;
 409
 410
 411 static int detect_coding_utf_8 P_ ((struct coding_system *, int *));
 412 static void decode_coding_utf_8 P_ ((struct coding_system *));
 413 static int encode_coding_utf_8 P_ ((struct coding_system *));
 414
 415 static int detect_coding_utf_16 P_ ((struct coding_system *, int *));
 416 static void decode_coding_utf_16 P_ ((struct coding_system *));
 417 static int encode_coding_utf_16 P_ ((struct coding_system *));
 418
 419 static int detect_coding_iso_2022 P_ ((struct coding_system *, int *));
 420 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 421 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 422
 423 static int detect_coding_emacs_mule P_ ((struct coding_system *, int *));
 424 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 425 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 426
 427 static int detect_coding_sjis P_ ((struct coding_system *, int *));
 428 static void decode_coding_sjis P_ ((struct coding_system *));
 429 static int encode_coding_sjis P_ ((struct coding_system *));
 430
 431 static int detect_coding_big5 P_ ((struct coding_system *, int *));
 432 static void decode_coding_big5 P_ ((struct coding_system *));
 433 static int encode_coding_big5 P_ ((struct coding_system *));
 434
 435 static int detect_coding_ccl P_ ((struct coding_system *, int *));
 436 static void decode_coding_ccl P_ ((struct coding_system *));
 437 static int encode_coding_ccl P_ ((struct coding_system *));
 438
 439 static void decode_coding_raw_text P_ ((struct coding_system *));
 440 static int encode_coding_raw_text P_ ((struct coding_system *));
 441
 442
 443 /* ISO2022 section */
 444
 445 #define CODING_ISO_INITIAL(coding, reg)                 \
 446   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 447                      coding_attr_iso_initial),          \
 448                reg)))
 449
 450
 451 #define CODING_ISO_REQUEST(coding, charset_id)  \
 452   ((charset_id <= (coding)->max_charset_id      \
 453     ? (coding)->safe_charsets[charset_id]       \
 454     : -1))
 455
 456
 457 #define CODING_ISO_FLAGS(coding)        \
 458   ((coding)->spec.iso_2022.flags)
 459 #define CODING_ISO_DESIGNATION(coding, reg)     \
 460   ((coding)->spec.iso_2022.current_designation[reg])
 461 #define CODING_ISO_INVOCATION(coding, plane)    \
 462   ((coding)->spec.iso_2022.current_invocation[plane])
 463 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 464   ((coding)->spec.iso_2022.single_shifting)
 465 #define CODING_ISO_BOL(coding)  \
 466   ((coding)->spec.iso_2022.bol)
 467 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 468   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 469
 470 /* Control characters of ISO2022.  */
 471                         /* code */      /* function */
 472 #define ISO_CODE_LF     0x0A            /* line-feed */
 473 #define ISO_CODE_CR     0x0D            /* carriage-return */
 474 #define ISO_CODE_SO     0x0E            /* shift-out */
 475 #define ISO_CODE_SI     0x0F            /* shift-in */
 476 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 477 #define ISO_CODE_ESC    0x1B            /* escape */
 478 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 479 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 480 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 481
 482 /* All code (1-byte) of ISO2022 is classified into one of the
 483    followings.  */
 484 enum iso_code_class_type
 485   {
 486     ISO_control_0,              /* Control codes in the range
 487                                    0x00..0x1F and 0x7F, except for the
 488                                    following 5 codes.  */
 489     ISO_carriage_return,        /* ISO_CODE_CR (0x0D) */
 490     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 491     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 492     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 493     ISO_escape,                 /* ISO_CODE_SO (0x1B) */
 494     ISO_control_1,              /* Control codes in the range
 495                                    0x80..0x9F, except for the
 496                                    following 3 codes.  */
 497     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 498     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 499     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 500     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 501     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 502     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 503     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 504   };
 505
 506 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 507     `iso-flags' attribute of an iso2022 coding system.  */
 508
 509 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 510    instead of the correct short-form sequence (e.g. ESC $ A).  */
 511 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 512
 513 /* If set, reset graphic planes and registers at end-of-line to the
 514    initial state.  */
 515 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 516
 517 /* If set, reset graphic planes and registers before any control
 518    characters to the initial state.  */
 519 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 520
 521 /* If set, encode by 7-bit environment.  */
 522 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 523
 524 /* If set, use locking-shift function.  */
 525 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 526
 527 /* If set, use single-shift function.  Overwrite
 528    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 529 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 530
 531 /* If set, use designation escape sequence.  */
 532 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 533
 534 /* If set, produce revision number sequence.  */
 535 #define CODING_ISO_FLAG_REVISION        0x0080
 536
 537 /* If set, produce ISO6429's direction specifying sequence.  */
 538 #define CODING_ISO_FLAG_DIRECTION       0x0100
 539
 540 /* If set, assume designation states are reset at beginning of line on
 541    output.  */
 542 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 543
 544 /* If set, designation sequence should be placed at beginning of line
 545    on output.  */
 546 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 547
 548 /* If set, do not encode unsafe charactes on output.  */
 549 #define CODING_ISO_FLAG_SAFE            0x0800
 550
 551 /* If set, extra latin codes (128..159) are accepted as a valid code
 552    on input.  */
 553 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 554
 555 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 556
 557 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 558
 559 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 560
 561 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 562
 563 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 564
 565 /* A character to be produced on output if encoding of the original
 566    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 567 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 568
 569
 570 /* UTF-16 section */
 571 #define CODING_UTF_16_BOM(coding)       \
 572   ((coding)->spec.utf_16.bom)
 573
 574 #define CODING_UTF_16_ENDIAN(coding)    \
 575   ((coding)->spec.utf_16.endian)
 576
 577 #define CODING_UTF_16_SURROGATE(coding) \
 578   ((coding)->spec.utf_16.surrogate)
 579
 580
 581 /* CCL section */
 582 #define CODING_CCL_DECODER(coding)      \
 583   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 584 #define CODING_CCL_ENCODER(coding)      \
 585   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 586 #define CODING_CCL_VALIDS(coding)                                          \
 587   (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
 588    ->data)
 589
 590 /* Index for each coding category in `coding_category_table' */
 591
 592 enum coding_category
 593   {
 594     coding_category_iso_7,
 595     coding_category_iso_7_tight,
 596     coding_category_iso_8_1,
 597     coding_category_iso_8_2,
 598     coding_category_iso_7_else,
 599     coding_category_iso_8_else,
 600     coding_category_utf_8,
 601     coding_category_utf_16_auto,
 602     coding_category_utf_16_be,
 603     coding_category_utf_16_le,
 604     coding_category_utf_16_be_nosig,
 605     coding_category_utf_16_le_nosig,
 606     coding_category_charset,
 607     coding_category_sjis,
 608     coding_category_big5,
 609     coding_category_ccl,
 610     coding_category_emacs_mule,
 611     /* All above are targets of code detection.  */
 612     coding_category_raw_text,
 613     coding_category_undecided,
 614     coding_category_max
 615   };
 616
 617 /* Definitions of flag bits used in detect_coding_XXXX.  */
 618 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 619 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 620 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 621 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 622 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 623 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 624 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 625 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 626 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 627 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 628 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 629 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 630 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 631 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 632 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 633 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 634
 635 /* This value is returned if detect_coding_mask () find nothing other
 636    than ASCII characters.  */
 637 #define CATEGORY_MASK_ANY               \
 638   (CATEGORY_MASK_ISO_7                  \
 639    | CATEGORY_MASK_ISO_7_TIGHT          \
 640    | CATEGORY_MASK_ISO_8_1              \
 641    | CATEGORY_MASK_ISO_8_2              \
 642    | CATEGORY_MASK_ISO_7_ELSE           \
 643    | CATEGORY_MASK_ISO_8_ELSE           \
 644    | CATEGORY_MASK_UTF_8                \
 645    | CATEGORY_MASK_UTF_16_BE            \
 646    | CATEGORY_MASK_UTF_16_LE            \
 647    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 648    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 649    | CATEGORY_MASK_CHARSET              \
 650    | CATEGORY_MASK_SJIS                 \
 651    | CATEGORY_MASK_BIG5                 \
 652    | CATEGORY_MASK_CCL                  \
 653    | CATEGORY_MASK_EMACS_MULE)
 654
 655
 656 #define CATEGORY_MASK_ISO_7BIT \
 657   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 658
 659 #define CATEGORY_MASK_ISO_8BIT \
 660   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 661
 662 #define CATEGORY_MASK_ISO_ELSE \
 663   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 664
 665 #define CATEGORY_MASK_ISO_ESCAPE        \
 666   (CATEGORY_MASK_ISO_7                  \
 667    | CATEGORY_MASK_ISO_7_TIGHT          \
 668    | CATEGORY_MASK_ISO_7_ELSE           \
 669    | CATEGORY_MASK_ISO_8_ELSE)
 670
 671 #define CATEGORY_MASK_ISO       \
 672   (  CATEGORY_MASK_ISO_7BIT     \
 673      | CATEGORY_MASK_ISO_8BIT   \
 674      | CATEGORY_MASK_ISO_ELSE)
 675
 676 #define CATEGORY_MASK_UTF_16            \
 677   (CATEGORY_MASK_UTF_16_BE              \
 678    | CATEGORY_MASK_UTF_16_LE            \
 679    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 680    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 681
 682
 683 /* List of symbols `coding-category-xxx' ordered by priority.  This
 684    variable is exposed to Emacs Lisp.  */
 685 static Lisp_Object Vcoding_category_list;
 686
 687 /* Table of coding categories (Lisp symbols).  This variable is for
 688    internal use oly.  */
 689 static Lisp_Object Vcoding_category_table;
 690
 691 /* Table of coding-categories ordered by priority.  */
 692 static enum coding_category coding_priorities[coding_category_max];
 693
 694 /* Nth element is a coding context for the coding system bound to the
 695    Nth coding category.  */
 696 static struct coding_system coding_categories[coding_category_max];
 697
 698 static int detected_mask[coding_category_raw_text] =
 699   { CATEGORY_MASK_ISO,
 700     CATEGORY_MASK_ISO,
 701     CATEGORY_MASK_ISO,
 702     CATEGORY_MASK_ISO,
 703     CATEGORY_MASK_ISO,
 704     CATEGORY_MASK_ISO,
 705     CATEGORY_MASK_UTF_8,
 706     CATEGORY_MASK_UTF_16,
 707     CATEGORY_MASK_UTF_16,
 708     CATEGORY_MASK_UTF_16,
 709     CATEGORY_MASK_UTF_16,
 710     CATEGORY_MASK_UTF_16,
 711     CATEGORY_MASK_CHARSET,
 712     CATEGORY_MASK_SJIS,
 713     CATEGORY_MASK_BIG5,
 714     CATEGORY_MASK_CCL,
 715     CATEGORY_MASK_EMACS_MULE
 716   };
 717
 718 /*** Commonly used macros and functions ***/
 719
 720 #ifndef min
 721 #define min(a, b) ((a) < (b) ? (a) : (b))
 722 #endif
 723 #ifndef max
 724 #define max(a, b) ((a) > (b) ? (a) : (b))
 725 #endif
 726
 727 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list)  \
 728   do {                                                          \
 729     attrs = CODING_ID_ATTRS (coding->id);                       \
 730     eol_type = CODING_ID_EOL_TYPE (coding->id);                 \
 731     if (VECTORP (eol_type))                                     \
 732       eol_type = Qunix;                                         \
 733     charset_list = CODING_ATTR_CHARSET_LIST (attrs);            \
 734   } while (0)
 735
 736
 737 /* Safely get one byte from the source text pointed by SRC which ends
 738    at SRC_END, and set C to that byte.  If there are not enough bytes
 739    in the source, it jumps to `no_more_source'.  The caller
 740    should declare and set these variables appropriately in advance:
 741         src, src_end, multibytep
 742 */
 743
 744 #define ONE_MORE_BYTE(c)                                        \
 745   do {                                                          \
 746     if (src == src_end)                                         \
 747       {                                                         \
 748         if (src_base < src)                                     \
 749           coding->result = CODING_RESULT_INSUFFICIENT_SRC;      \
 750         goto no_more_source;                                    \
 751       }                                                         \
 752     c = *src++;                                                 \
 753     if (multibytep && (c & 0x80))                               \
 754       {                                                         \
 755         if ((c & 0xFE) != 0xC0)                                 \
 756           error ("Undecodable char found");                     \
 757         c = ((c & 1) << 6) | *src++;                            \
 758       }                                                         \
 759     consumed_chars++;                                           \
 760   } while (0)
 761
 762
 763 #define ONE_MORE_BYTE_NO_CHECK(c)               \
 764   do {                                          \
 765     c = *src++;                                 \
 766     if (multibytep && (c & 0x80))               \
 767       {                                         \
 768         if ((c & 0xFE) != 0xC0)                 \
 769           error ("Undecodable char found");     \
 770         c = ((c & 1) << 6) | *src++;            \
 771       }                                         \
 772     consumed_chars++;                           \
 773   } while (0)
 774
 775
 776 /* Store a byte C in the place pointed by DST and increment DST to the
 777    next free point, and increment PRODUCED_CHARS.  The caller should
 778    assure that C is 0..127, and declare and set the variable `dst'
 779    appropriately in advance.
 780 */
 781
 782
 783 #define EMIT_ONE_ASCII_BYTE(c)  \
 784   do {                          \
 785     produced_chars++;           \
 786     *dst++ = (c);               \
 787   } while (0)
 788
 789
 790 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 791
 792 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 793   do {                                  \
 794     produced_chars += 2;                \
 795     *dst++ = (c1), *dst++ = (c2);       \
 796   } while (0)
 797
 798
 799 /* Store a byte C in the place pointed by DST and increment DST to the
 800    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 801    nonzero, store in an appropriate multibyte from.  The caller should
 802    declare and set the variables `dst' and `multibytep' appropriately
 803    in advance.  */
 804
 805 #define EMIT_ONE_BYTE(c)                \
 806   do {                                  \
 807     produced_chars++;                   \
 808     if (multibytep)                     \
 809       {                                 \
 810         int ch = (c);                   \
 811         if (ch >= 0x80)                 \
 812           ch = BYTE8_TO_CHAR (ch);      \
 813         CHAR_STRING_ADVANCE (ch, dst);  \
 814       }                                 \
 815     else                                \
 816       *dst++ = (c);                     \
 817   } while (0)
 818
 819
 820 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 821
 822 #define EMIT_TWO_BYTES(c1, c2)          \
 823   do {                                  \
 824     produced_chars += 2;                \
 825     if (multibytep)                     \
 826       {                                 \
 827         int ch;                         \
 828                                         \
 829         ch = (c1);                      \
 830         if (ch >= 0x80)                 \
 831           ch = BYTE8_TO_CHAR (ch);      \
 832         CHAR_STRING_ADVANCE (ch, dst);  \
 833         ch = (c2);                      \
 834         if (ch >= 0x80)                 \
 835           ch = BYTE8_TO_CHAR (ch);      \
 836         CHAR_STRING_ADVANCE (ch, dst);  \
 837       }                                 \
 838     else                                \
 839       {                                 \
 840         *dst++ = (c1);                  \
 841         *dst++ = (c2);                  \
 842       }                                 \
 843   } while (0)
 844
 845
 846 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 847   do {                                  \
 848     EMIT_ONE_BYTE (c1);                 \
 849     EMIT_TWO_BYTES (c2, c3);            \
 850   } while (0)
 851
 852
 853 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 854   do {                                          \
 855     EMIT_TWO_BYTES (c1, c2);                    \
 856     EMIT_TWO_BYTES (c3, c4);                    \
 857   } while (0)
 858
 859
 860 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 861   do {                                                                       \
 862     charset_map_loaded = 0;                                                  \
 863     c = DECODE_CHAR (charset, code);                                         \
 864     if (charset_map_loaded)                                                  \
 865       {                                                                      \
 866         unsigned char *orig = coding->source;                                \
 867         EMACS_INT offset;                                                    \
 868                                                                              \
 869         coding_set_source (coding);                                          \
 870         offset = coding->source - orig;                                      \
 871         src += offset;                                                       \
 872         src_base += offset;                                                  \
 873         src_end += offset;                                                   \
 874       }                                                                      \
 875   } while (0)
 876
 877
 878 #define ASSURE_DESTINATION(bytes)                               \
 879   do {                                                          \
 880     if (dst + (bytes) >= dst_end)                               \
 881       {                                                         \
 882         int more_bytes = charbuf_end - charbuf + (bytes);       \
 883                                                                 \
 884         dst = alloc_destination (coding, more_bytes, dst);      \
 885         dst_end = coding->destination + coding->dst_bytes;      \
 886       }                                                         \
 887   } while (0)
 888
 889
 890
 891 static void
 892 coding_set_source (coding)
 893      struct coding_system *coding;
 894 {
 895   if (BUFFERP (coding->src_object))
 896     {
 897       if (coding->src_pos < 0)
 898         coding->source = GAP_END_ADDR + coding->src_pos_byte;
 899       else
 900         {
 901           struct buffer *buf = XBUFFER (coding->src_object);
 902           EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
 903           unsigned char *beg_addr = BUF_BEG_ADDR (buf);
 904
 905           coding->source = beg_addr + coding->src_pos_byte - 1;
 906           if (coding->src_pos_byte >= gpt_byte)
 907             coding->source += BUF_GAP_SIZE (buf);
 908         }
 909     }
 910   else if (STRINGP (coding->src_object))
 911     {
 912       coding->source = (XSTRING (coding->src_object)->data
 913                         + coding->src_pos_byte);
 914     }
 915   else
 916     /* Otherwise, the source is C string and is never relocated
 917        automatically.  Thus we don't have to update anything.  */
 918     ;
 919 }
 920
 921 static void
 922 coding_set_destination (coding)
 923      struct coding_system *coding;
 924 {
 925   if (BUFFERP (coding->dst_object))
 926     {
 927       /* We are sure that coding->dst_pos_byte is before the gap of the
 928          buffer. */
 929       coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 930                              + coding->dst_pos_byte - 1);
 931       if (coding->src_pos < 0)
 932         coding->dst_bytes = (GAP_END_ADDR
 933                              - (coding->src_bytes - coding->consumed)
 934                              - coding->destination);
 935       else
 936         coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 937                              - coding->destination);
 938     }
 939   else
 940     /* Otherwise, the destination is C string and is never relocated
 941        automatically.  Thus we don't have to update anything.  */
 942     ;
 943 }
 944
 945
 946 static void
 947 coding_alloc_by_realloc (coding, bytes)
 948      struct coding_system *coding;
 949      EMACS_INT bytes;
 950 {
 951   coding->destination = (unsigned char *) xrealloc (coding->destination,
 952                                                     coding->dst_bytes + bytes);
 953   coding->dst_bytes += bytes;
 954 }
 955
 956 static void
 957 coding_alloc_by_making_gap (coding, bytes)
 958      struct coding_system *coding;
 959      EMACS_INT bytes;
 960 {
 961   if (BUFFERP (coding->dst_object)
 962       && EQ (coding->src_object, coding->dst_object))
 963     {
 964       EMACS_INT add = coding->src_bytes - coding->consumed;
 965
 966       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
 967       make_gap (bytes);
 968       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
 969     }
 970   else
 971     {
 972       Lisp_Object this_buffer;
 973
 974       this_buffer = Fcurrent_buffer ();
 975       set_buffer_internal (XBUFFER (coding->dst_object));
 976       make_gap (bytes);
 977       set_buffer_internal (XBUFFER (this_buffer));
 978     }
 979 }
 980
 981
 982 static unsigned char *
 983 alloc_destination (coding, nbytes, dst)
 984      struct coding_system *coding;
 985      int nbytes;
 986      unsigned char *dst;
 987 {
 988   EMACS_INT offset = dst - coding->destination;
 989
 990   if (BUFFERP (coding->dst_object))
 991     coding_alloc_by_making_gap (coding, nbytes);
 992   else
 993     coding_alloc_by_realloc (coding, nbytes);
 994   coding->result = CODING_RESULT_SUCCESS;
 995   coding_set_destination (coding);
 996   dst = coding->destination + offset;
 997   return dst;
 998 }
 999
1000 \f
1001 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1002
1003
1004
1005 \f
1006 /*** 3. UTF-8 ***/
1007
1008 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1009    Check if a text is encoded in UTF-8.  If it is, return
1010    CATEGORY_MASK_UTF_8, else return 0.  */
1011
1012 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1013 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1014 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1015 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1016 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1017 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1018
1019 static int
1020 detect_coding_utf_8 (coding, mask)
1021      struct coding_system *coding;
1022      int *mask;
1023 {
1024   unsigned char *src = coding->source, *src_base = src;
1025   unsigned char *src_end = coding->source + coding->src_bytes;
1026   int multibytep = coding->src_multibyte;
1027   int consumed_chars = 0;
1028   int found = 0;
1029
1030   /* A coding system of this category is always ASCII compatible.  */
1031   src += coding->head_ascii;
1032
1033   while (1)
1034     {
1035       int c, c1, c2, c3, c4;
1036
1037       ONE_MORE_BYTE (c);
1038       if (UTF_8_1_OCTET_P (c))
1039         continue;
1040       ONE_MORE_BYTE (c1);
1041       if (! UTF_8_EXTRA_OCTET_P (c1))
1042         break;
1043       if (UTF_8_2_OCTET_LEADING_P (c))
1044         {
1045           found++;
1046           continue;
1047         }
1048       ONE_MORE_BYTE (c2);
1049       if (! UTF_8_EXTRA_OCTET_P (c2))
1050         break;
1051       if (UTF_8_3_OCTET_LEADING_P (c))
1052         {
1053           found++;
1054           continue;
1055         }
1056       ONE_MORE_BYTE (c3);
1057       if (! UTF_8_EXTRA_OCTET_P (c3))
1058         break;
1059       if (UTF_8_4_OCTET_LEADING_P (c))
1060         {
1061           found++;
1062           continue;
1063         }
1064       ONE_MORE_BYTE (c4);
1065       if (! UTF_8_EXTRA_OCTET_P (c4))
1066         break;
1067       if (UTF_8_5_OCTET_LEADING_P (c))
1068         {
1069           found++;
1070           continue;
1071         }
1072       break;
1073     }
1074   *mask &= ~CATEGORY_MASK_UTF_8;
1075   return 0;
1076
1077  no_more_source:
1078   if (! found)
1079     return 0;
1080   *mask &= CATEGORY_MASK_UTF_8;
1081   return 1;
1082 }
1083
1084
1085 /* Fixme: deal with surrogates?  */
1086 static void
1087 decode_coding_utf_8 (coding)
1088      struct coding_system *coding;
1089 {
1090   unsigned char *src = coding->source + coding->consumed;
1091   unsigned char *src_end = coding->source + coding->src_bytes;
1092   unsigned char *src_base;
1093   int *charbuf = coding->charbuf;
1094   int *charbuf_end = charbuf + coding->charbuf_size;
1095   int consumed_chars = 0, consumed_chars_base;
1096   int multibytep = coding->src_multibyte;
1097   Lisp_Object attr, eol_type, charset_list;
1098
1099   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1100
1101   while (1)
1102     {
1103       int c, c1, c2, c3, c4, c5;
1104
1105       src_base = src;
1106       consumed_chars_base = consumed_chars;
1107
1108       if (charbuf >= charbuf_end)
1109         break;
1110
1111       ONE_MORE_BYTE (c1);
1112       if (UTF_8_1_OCTET_P(c1))
1113         {
1114           c = c1;
1115           if (c == '\r')
1116             {
1117               if (EQ (eol_type, Qdos))
1118                 {
1119                   if (src == src_end)
1120                     goto no_more_source;
1121                   if (*src == '\n')
1122                     ONE_MORE_BYTE (c);
1123                 }
1124               else if (EQ (eol_type, Qmac))
1125                 c = '\n';
1126             }
1127         }
1128       else
1129         {
1130           ONE_MORE_BYTE (c2);
1131           if (! UTF_8_EXTRA_OCTET_P (c2))
1132             goto invalid_code;
1133           if (UTF_8_2_OCTET_LEADING_P (c1))
1134             {
1135               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1136               /* Reject overlong sequences here and below.  Encoders
1137                  producing them are incorrect, they can be misleading,
1138                  and they mess up read/write invariance.  */
1139               if (c < 128)
1140                 goto invalid_code;
1141             }
1142           else
1143             {
1144               ONE_MORE_BYTE (c3);
1145               if (! UTF_8_EXTRA_OCTET_P (c3))
1146                 goto invalid_code;
1147               if (UTF_8_3_OCTET_LEADING_P (c1))
1148                 {
1149                   c = (((c1 & 0xF) << 12)
1150                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1151                   if (c < 0x800)
1152                     goto invalid_code;
1153                 }
1154               else
1155                 {
1156                   ONE_MORE_BYTE (c4);
1157                   if (! UTF_8_EXTRA_OCTET_P (c4))
1158                     goto invalid_code;
1159                   if (UTF_8_4_OCTET_LEADING_P (c1))
1160                     {
1161                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1162                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1163                     if (c < 0x10000)
1164                       goto invalid_code;
1165                     }
1166                   else
1167                     {
1168                       ONE_MORE_BYTE (c5);
1169                       if (! UTF_8_EXTRA_OCTET_P (c5))
1170                         goto invalid_code;
1171                       if (UTF_8_5_OCTET_LEADING_P (c1))
1172                         {
1173                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1174                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1175                                | (c5 & 0x3F));
1176                           if ((c > MAX_CHAR) || (c < 0x200000))
1177                             goto invalid_code;
1178                         }
1179                       else
1180                         goto invalid_code;
1181                     }
1182                 }
1183             }
1184         }
1185
1186       *charbuf++ = c;
1187       continue;
1188
1189     invalid_code:
1190       src = src_base;
1191       consumed_chars = consumed_chars_base;
1192       ONE_MORE_BYTE (c);
1193       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1194       coding->errors++;
1195     }
1196
1197  no_more_source:
1198   coding->consumed_char += consumed_chars_base;
1199   coding->consumed = src_base - coding->source;
1200   coding->charbuf_used = charbuf - coding->charbuf;
1201 }
1202
1203
1204 static int
1205 encode_coding_utf_8 (coding)
1206      struct coding_system *coding;
1207 {
1208   int multibytep = coding->dst_multibyte;
1209   int *charbuf = coding->charbuf;
1210   int *charbuf_end = charbuf + coding->charbuf_used;
1211   unsigned char *dst = coding->destination + coding->produced;
1212   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1213   int produced_chars = 0;
1214   int c;
1215
1216   if (multibytep)
1217     {
1218       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1219
1220       while (charbuf < charbuf_end)
1221         {
1222           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1223
1224           ASSURE_DESTINATION (safe_room);
1225           c = *charbuf++;
1226           CHAR_STRING_ADVANCE (c, pend);
1227           for (p = str; p < pend; p++)
1228             EMIT_ONE_BYTE (*p);
1229         }
1230     }
1231   else
1232     {
1233       int safe_room = MAX_MULTIBYTE_LENGTH;
1234
1235       while (charbuf < charbuf_end)
1236         {
1237           ASSURE_DESTINATION (safe_room);
1238           c = *charbuf++;
1239           dst += CHAR_STRING (c, dst);
1240           produced_chars++;
1241         }
1242     }
1243   coding->result = CODING_RESULT_SUCCESS;
1244   coding->produced_char += produced_chars;
1245   coding->produced = dst - coding->destination;
1246   return 0;
1247 }
1248
1249
1250 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1251    Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1252    Little Endian (otherwise).  If it is, return
1253    CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
1254    else return 0.  */
1255
1256 #define UTF_16_HIGH_SURROGATE_P(val) \
1257   (((val) & 0xFC00) == 0xD800)
1258
1259 #define UTF_16_LOW_SURROGATE_P(val) \
1260   (((val) & 0xFC00) == 0xDC00)
1261
1262 #define UTF_16_INVALID_P(val)   \
1263   (((val) == 0xFFFE)            \
1264    || ((val) == 0xFFFF)         \
1265    || UTF_16_LOW_SURROGATE_P (val))
1266
1267
1268 static int
1269 detect_coding_utf_16 (coding, mask)
1270      struct coding_system *coding;
1271      int *mask;
1272 {
1273   unsigned char *src = coding->source, *src_base = src;
1274   unsigned char *src_end = coding->source + coding->src_bytes;
1275   int multibytep = coding->src_multibyte;
1276   int consumed_chars = 0;
1277   int c1, c2;
1278
1279   ONE_MORE_BYTE (c1);
1280   ONE_MORE_BYTE (c2);
1281
1282   if ((c1 == 0xFF) && (c2 == 0xFE))
1283     {
1284       *mask &= CATEGORY_MASK_UTF_16_LE;
1285       return 1;
1286     }
1287   else if ((c1 == 0xFE) && (c2 == 0xFF))
1288     {
1289       *mask &= CATEGORY_MASK_UTF_16_BE;
1290       return 1;
1291     }
1292  no_more_source:
1293   return 0;
1294 }
1295
1296 static void
1297 decode_coding_utf_16 (coding)
1298      struct coding_system *coding;
1299 {
1300   unsigned char *src = coding->source + coding->consumed;
1301   unsigned char *src_end = coding->source + coding->src_bytes;
1302   unsigned char *src_base;
1303   int *charbuf = coding->charbuf;
1304   int *charbuf_end = charbuf + coding->charbuf_size;
1305   int consumed_chars = 0, consumed_chars_base;
1306   int multibytep = coding->src_multibyte;
1307   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1308   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1309   int surrogate = CODING_UTF_16_SURROGATE (coding);
1310   Lisp_Object attr, eol_type, charset_list;
1311
1312   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1313
1314   if (bom != utf_16_without_bom)
1315     {
1316       int c, c1, c2;
1317
1318       src_base = src;
1319       ONE_MORE_BYTE (c1);
1320       ONE_MORE_BYTE (c2);
1321       c = (c1 << 8) | c2;
1322       if (bom == utf_16_with_bom)
1323         {
1324           if (endian == utf_16_big_endian
1325               ? c != 0xFFFE : c != 0xFEFF)
1326             {
1327               /* We are sure that there's enouph room at CHARBUF.  */
1328               *charbuf++ = c1;
1329               *charbuf++ = c2;
1330               coding->errors++;
1331             }
1332         }
1333       else
1334         {
1335           if (c == 0xFFFE)
1336             CODING_UTF_16_ENDIAN (coding)
1337               = endian = utf_16_big_endian;
1338           else if (c == 0xFEFF)
1339             CODING_UTF_16_ENDIAN (coding)
1340               = endian = utf_16_little_endian;
1341           else
1342             {
1343               CODING_UTF_16_ENDIAN (coding)
1344                 = endian = utf_16_big_endian;
1345               src = src_base;
1346             }
1347         }
1348       CODING_UTF_16_BOM (coding) = utf_16_with_bom;
1349     }
1350
1351   while (1)
1352     {
1353       int c, c1, c2;
1354
1355       src_base = src;
1356       consumed_chars_base = consumed_chars;
1357
1358       if (charbuf + 2 >= charbuf_end)
1359         break;
1360
1361       ONE_MORE_BYTE (c1);
1362       ONE_MORE_BYTE (c2);
1363       c = (endian == utf_16_big_endian
1364            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1365       if (surrogate)
1366         {
1367           if (! UTF_16_LOW_SURROGATE_P (c))
1368             {
1369               if (endian == utf_16_big_endian)
1370                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1371               else
1372                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1373               *charbuf++ = c1;
1374               *charbuf++ = c2;
1375               coding->errors++;
1376               if (UTF_16_HIGH_SURROGATE_P (c))
1377                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1378               else
1379                 *charbuf++ = c;
1380             }
1381           else
1382             {
1383               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1384               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1385               *charbuf++ = c;
1386             }
1387         }
1388       else
1389         {
1390           if (UTF_16_HIGH_SURROGATE_P (c))
1391             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1392           else
1393             *charbuf++ = c;
1394         }
1395     }
1396
1397  no_more_source:
1398   coding->consumed_char += consumed_chars_base;
1399   coding->consumed = src_base - coding->source;
1400   coding->charbuf_used = charbuf - coding->charbuf;
1401 }
1402
1403 static int
1404 encode_coding_utf_16 (coding)
1405      struct coding_system *coding;
1406 {
1407   int multibytep = coding->dst_multibyte;
1408   int *charbuf = coding->charbuf;
1409   int *charbuf_end = charbuf + coding->charbuf_used;
1410   unsigned char *dst = coding->destination + coding->produced;
1411   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1412   int safe_room = 8;
1413   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1414   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1415   int produced_chars = 0;
1416   Lisp_Object attrs, eol_type, charset_list;
1417   int c;
1418
1419   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1420
1421   if (bom == utf_16_with_bom)
1422     {
1423       ASSURE_DESTINATION (safe_room);
1424       if (big_endian)
1425         EMIT_TWO_BYTES (0xFF, 0xFE);
1426       else
1427         EMIT_TWO_BYTES (0xFE, 0xFF);
1428       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1429     }
1430
1431   while (charbuf < charbuf_end)
1432     {
1433       ASSURE_DESTINATION (safe_room);
1434       c = *charbuf++;
1435       if (c >= MAX_UNICODE_CHAR)
1436         c = coding->default_char;
1437
1438       if (c < 0x10000)
1439         {
1440           if (big_endian)
1441             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1442           else
1443             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1444         }
1445       else
1446         {
1447           int c1, c2;
1448
1449           c -= 0x10000;
1450           c1 = (c >> 10) + 0xD800;
1451           c2 = (c & 0x3FF) + 0xDC00;
1452           if (big_endian)
1453             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1454           else
1455             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1456         }
1457     }
1458   coding->result = CODING_RESULT_SUCCESS;
1459   coding->produced = dst - coding->destination;
1460   coding->produced_char += produced_chars;
1461   return 0;
1462 }
1463
1464 \f
1465 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1466
1467 /* Emacs' internal format for representation of multiple character
1468    sets is a kind of multi-byte encoding, i.e. characters are
1469    represented by variable-length sequences of one-byte codes.
1470
1471    ASCII characters and control characters (e.g. `tab', `newline') are
1472    represented by one-byte sequences which are their ASCII codes, in
1473    the range 0x00 through 0x7F.
1474
1475    8-bit characters of the range 0x80..0x9F are represented by
1476    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1477    code + 0x20).
1478
1479    8-bit characters of the range 0xA0..0xFF are represented by
1480    one-byte sequences which are their 8-bit code.
1481
1482    The other characters are represented by a sequence of `base
1483    leading-code', optional `extended leading-code', and one or two
1484    `position-code's.  The length of the sequence is determined by the
1485    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1486    whereas extended leading-code and position-code take the range 0xA0
1487    through 0xFF.  See `charset.h' for more details about leading-code
1488    and position-code.
1489
1490    --- CODE RANGE of Emacs' internal format ---
1491    character set        range
1492    -------------        -----
1493    ascii                0x00..0x7F
1494    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1495    eight-bit-graphic    0xA0..0xBF
1496    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1497    ---------------------------------------------
1498
1499    As this is the internal character representation, the format is
1500    usually not used externally (i.e. in a file or in a data sent to a
1501    process).  But, it is possible to have a text externally in this
1502    format (i.e. by encoding by the coding system `emacs-mule').
1503
1504    In that case, a sequence of one-byte codes has a slightly different
1505    form.
1506
1507    At first, all characters in eight-bit-control are represented by
1508    one-byte sequences which are their 8-bit code.
1509
1510    Next, character composition data are represented by the byte
1511    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1512    where,
1513         METHOD is 0xF0 plus one of composition method (enum
1514         composition_method),
1515
1516         BYTES is 0xA0 plus a byte length of this composition data,
1517
1518         CHARS is 0x20 plus a number of characters composed by this
1519         data,
1520
1521         COMPONENTs are characters of multibye form or composition
1522         rules encoded by two-byte of ASCII codes.
1523
1524    In addition, for backward compatibility, the following formats are
1525    also recognized as composition data on decoding.
1526
1527    0x80 MSEQ ...
1528    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1529
1530    Here,
1531         MSEQ is a multibyte form but in these special format:
1532           ASCII: 0xA0 ASCII_CODE+0x80,
1533           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1534         RULE is a one byte code of the range 0xA0..0xF0 that
1535         represents a composition rule.
1536   */
1537
1538 char emacs_mule_bytes[256];
1539
1540 /* Leading-code followed by extended leading-code.  */
1541 #define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
1542 #define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
1543 #define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
1544 #define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
1545
1546
1547 int
1548 emacs_mule_char (coding, src, nbytes, nchars)
1549      struct coding_system *coding;
1550      unsigned char *src;
1551      int *nbytes, *nchars;
1552 {
1553   unsigned char *src_end = coding->source + coding->src_bytes;
1554   int multibytep = coding->src_multibyte;
1555   unsigned char *src_base = src;
1556   struct charset *charset;
1557   unsigned code;
1558   int c;
1559   int consumed_chars = 0;
1560
1561   ONE_MORE_BYTE (c);
1562   switch (emacs_mule_bytes[c])
1563     {
1564     case 2:
1565       if (! (charset = emacs_mule_charset[c]))
1566         goto invalid_code;
1567       ONE_MORE_BYTE (c);
1568       code = c & 0x7F;
1569       break;
1570
1571     case 3:
1572       if (c == LEADING_CODE_PRIVATE_11
1573           || c == LEADING_CODE_PRIVATE_12)
1574         {
1575           ONE_MORE_BYTE (c);
1576           if (! (charset = emacs_mule_charset[c]))
1577             goto invalid_code;
1578           ONE_MORE_BYTE (c);
1579           code = c & 0x7F;
1580         }
1581       else
1582         {
1583           if (! (charset = emacs_mule_charset[c]))
1584             goto invalid_code;
1585           ONE_MORE_BYTE (c);
1586           code = (c & 0x7F) << 8;
1587           ONE_MORE_BYTE (c);
1588           code |= c & 0x7F;
1589         }
1590       break;
1591
1592     case 4:
1593       ONE_MORE_BYTE (c);
1594       if (! (charset = emacs_mule_charset[c]))
1595         goto invalid_code;
1596       ONE_MORE_BYTE (c);
1597       code = (c & 0x7F) << 8;
1598       ONE_MORE_BYTE (c);
1599       code |= c & 0x7F;
1600       break;
1601
1602     case 1:
1603       code = c;
1604       charset = CHARSET_FROM_ID (ASCII_BYTE_P (code) ? charset_ascii
1605                                  : code < 0xA0 ? charset_8_bit_control
1606                                  : charset_8_bit_graphic);
1607       break;
1608
1609     default:
1610       abort ();
1611     }
1612   c = DECODE_CHAR (charset, code);
1613   if (c < 0)
1614     goto invalid_code;
1615   *nbytes = src - src_base;
1616   *nchars = consumed_chars;
1617   return c;
1618
1619  no_more_source:
1620   return -2;
1621
1622  invalid_code:
1623   return -1;
1624 }
1625
1626
1627 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1628    Check if a text is encoded in `emacs-mule'.  */
1629
1630 static int
1631 detect_coding_emacs_mule (coding, mask)
1632      struct coding_system *coding;
1633      int *mask;
1634 {
1635   unsigned char *src = coding->source, *src_base = src;
1636   unsigned char *src_end = coding->source + coding->src_bytes;
1637   int multibytep = coding->src_multibyte;
1638   int consumed_chars = 0;
1639   int c;
1640   int found = 0;
1641
1642   /* A coding system of this category is always ASCII compatible.  */
1643   src += coding->head_ascii;
1644
1645   while (1)
1646     {
1647       ONE_MORE_BYTE (c);
1648
1649       if (c == 0x80)
1650         {
1651           /* Perhaps the start of composite character.  We simple skip
1652              it because analyzing it is too heavy for detecting.  But,
1653              at least, we check that the composite character
1654              constitues of more than 4 bytes.  */
1655           unsigned char *src_base;
1656
1657         repeat:
1658           src_base = src;
1659           do
1660             {
1661               ONE_MORE_BYTE (c);
1662             }
1663           while (c >= 0xA0);
1664
1665           if (src - src_base <= 4)
1666             break;
1667           found = 1;
1668           if (c == 0x80)
1669             goto repeat;
1670         }
1671
1672       if (c < 0x80)
1673         {
1674           if (c < 0x20
1675               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1676             break;
1677         }
1678       else
1679         {
1680           unsigned char *src_base = src - 1;
1681
1682           do
1683             {
1684               ONE_MORE_BYTE (c);
1685             }
1686           while (c >= 0xA0);
1687           if (src - src_base != emacs_mule_bytes[*src_base])
1688             break;
1689           found = 1;
1690         }
1691     }
1692   *mask &= ~CATEGORY_MASK_EMACS_MULE;
1693   return 0;
1694
1695  no_more_source:
1696   if (!found)
1697     return 0;
1698   *mask &= CATEGORY_MASK_EMACS_MULE;
1699   return 1;
1700 }
1701
1702
1703 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1704
1705 /* Decode a character represented as a component of composition
1706    sequence of Emacs 20/21 style at SRC.  Set C to that character and
1707    update SRC to the head of next character (or an encoded composition
1708    rule).  If SRC doesn't points a composition component, set C to -1.
1709    If SRC points an invalid byte sequence, global exit by a return
1710    value 0.  */
1711
1712 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1713   if (1)                                                        \
1714     {                                                           \
1715       int c;                                                    \
1716       int nbytes, nchars;                                       \
1717                                                                 \
1718       if (src == src_end)                                       \
1719         break;                                                  \
1720       c = emacs_mule_char (coding, src, &nbytes, &nchars);      \
1721       if (c < 0)                                                \
1722         {                                                       \
1723           if (c == -2)                                          \
1724             break;                                              \
1725           goto invalid_code;                                    \
1726         }                                                       \
1727       *buf++ = c;                                               \
1728       src += nbytes;                                            \
1729       consumed_chars += nchars;                                 \
1730     }                                                           \
1731   else
1732
1733
1734 /* Decode a composition rule represented as a component of composition
1735    sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
1736    and increment BUF.  If SRC points an invalid byte sequence, set C
1737    to -1.  */
1738
1739 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1740   do {                                                  \
1741     int c, gref, nref;                                  \
1742                                                         \
1743     if (src >= src_end)                                 \
1744       goto invalid_code;                                \
1745     ONE_MORE_BYTE_NO_CHECK (c);                         \
1746     c -= 0x20;                                          \
1747     if (c < 0 || c >= 81)                               \
1748       goto invalid_code;                                \
1749                                                         \
1750     gref = c / 9, nref = c % 9;                         \
1751     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1752   } while (0)
1753
1754
1755 /* Decode a composition rule represented as a component of composition
1756    sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
1757    and increment BUF.  If SRC points an invalid byte sequence, set C
1758    to -1.  */
1759
1760 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1761   do {                                                  \
1762     int gref, nref;                                     \
1763                                                         \
1764     if (src + 1>= src_end)                              \
1765       goto invalid_code;                                \
1766     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1767     gref -= 0x20;                                       \
1768     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1769     nref -= 0x20;                                       \
1770     if (gref < 0 || gref >= 81                          \
1771         || nref < 0 || nref >= 81)                      \
1772       goto invalid_code;                                \
1773     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1774   } while (0)
1775
1776
1777 #define ADD_COMPOSITION_DATA(buf, method, nchars)       \
1778   do {                                                  \
1779     *buf++ = -5;                                        \
1780     *buf++ = coding->produced_char + char_offset;       \
1781     *buf++ = CODING_ANNOTATE_COMPOSITION_MASK;          \
1782     *buf++ = method;                                    \
1783     *buf++ = nchars;                                    \
1784   } while (0)
1785
1786
1787 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1788   do {                                                                  \
1789     /* Emacs 21 style format.  The first three bytes at SRC are         \
1790        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1791        the byte length of this composition information, CHARS is the    \
1792        number of characters composed by this composition.  */           \
1793     enum composition_method method = c - 0xF2;                          \
1794     int *charbuf_base = charbuf;                                        \
1795     int consumed_chars_limit;                                           \
1796     int nbytes, nchars;                                                 \
1797                                                                         \
1798     ONE_MORE_BYTE (c);                                                  \
1799     nbytes = c - 0xA0;                                                  \
1800     if (nbytes < 3)                                                     \
1801       goto invalid_code;                                                \
1802     ONE_MORE_BYTE (c);                                                  \
1803     nchars = c - 0xA0;                                                  \
1804     ADD_COMPOSITION_DATA (charbuf, method, nchars);                     \
1805     consumed_chars_limit = consumed_chars_base + nbytes;                \
1806     if (method != COMPOSITION_RELATIVE)                                 \
1807       {                                                                 \
1808         int i = 0;                                                      \
1809         while (consumed_chars < consumed_chars_limit)                   \
1810           {                                                             \
1811             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1812               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1813             else                                                        \
1814               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1815             i++;                                                        \
1816           }                                                             \
1817         if (consumed_chars < consumed_chars_limit)                      \
1818           goto invalid_code;                                            \
1819         charbuf_base[0] -= i;                                           \
1820       }                                                                 \
1821   } while (0)
1822
1823
1824 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1825   do {                                                          \
1826     /* Emacs 20 style format for relative composition.  */      \
1827     /* Store multibyte form of characters to be composed.  */   \
1828     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1829     int *buf = components;                                      \
1830     int i, j;                                                   \
1831                                                                 \
1832     src = src_base;                                             \
1833     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1834     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1835       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
1836     if (i < 2)                                                  \
1837       goto invalid_code;                                        \
1838     ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i);    \
1839     for (j = 0; j < i; j++)                                     \
1840       *charbuf++ = components[j];                               \
1841   } while (0)
1842
1843
1844 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
1845   do {                                                          \
1846     /* Emacs 20 style format for rule-base composition.  */     \
1847     /* Store multibyte form of characters to be composed.  */   \
1848     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1849     int *buf = components;                                      \
1850     int i, j;                                                   \
1851                                                                 \
1852     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
1853     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1854       {                                                         \
1855         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
1856         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
1857       }                                                         \
1858     if (i < 1 || (buf - components) % 2 == 0)                   \
1859       goto invalid_code;                                        \
1860     if (charbuf + i + (i / 2) + 1 < charbuf_end)                \
1861       goto no_more_source;                                      \
1862     ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i);       \
1863     for (j = 0; j < i; j++)                                     \
1864       *charbuf++ = components[j];                               \
1865     for (j = 0; j < i; j += 2)                                  \
1866       *charbuf++ = components[j];                               \
1867   } while (0)
1868
1869
1870 static void
1871 decode_coding_emacs_mule (coding)
1872      struct coding_system *coding;
1873 {
1874   unsigned char *src = coding->source + coding->consumed;
1875   unsigned char *src_end = coding->source + coding->src_bytes;
1876   unsigned char *src_base;
1877   int *charbuf = coding->charbuf;
1878   int *charbuf_end = charbuf + coding->charbuf_size;
1879   int consumed_chars = 0, consumed_chars_base;
1880   int char_offset = 0;
1881   int multibytep = coding->src_multibyte;
1882   Lisp_Object attrs, eol_type, charset_list;
1883
1884   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1885
1886   while (1)
1887     {
1888       int c;
1889
1890       src_base = src;
1891       consumed_chars_base = consumed_chars;
1892
1893       if (charbuf >= charbuf_end)
1894         break;
1895
1896       ONE_MORE_BYTE (c);
1897
1898       if (c < 0x80)
1899         {
1900           if (c == '\r')
1901             {
1902               if (EQ (eol_type, Qdos))
1903                 {
1904                   if (src == src_end)
1905                     goto no_more_source;
1906                   if (*src == '\n')
1907                     ONE_MORE_BYTE (c);
1908                 }
1909               else if (EQ (eol_type, Qmac))
1910                 c = '\n';
1911             }
1912           *charbuf++ = c;
1913           char_offset++;
1914         }
1915       else if (c == 0x80)
1916         {
1917           if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end)
1918             break;
1919           ONE_MORE_BYTE (c);
1920           if (c - 0xF2 >= COMPOSITION_RELATIVE
1921               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1922             DECODE_EMACS_MULE_21_COMPOSITION (c);
1923           else if (c < 0xC0)
1924             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1925           else if (c == 0xFF)
1926             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1927           else
1928             goto invalid_code;
1929           coding->annotated = 1;
1930         }
1931       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1932         {
1933           int nbytes, nchars;
1934           src = src_base;
1935           consumed_chars = consumed_chars_base;
1936           c = emacs_mule_char (coding, src, &nbytes, &nchars);
1937           if (c < 0)
1938             {
1939               if (c == -2)
1940                 break;
1941               goto invalid_code;
1942             }
1943           *charbuf++ = c;
1944           src += nbytes;
1945           consumed_chars += nchars;
1946           char_offset++;
1947         }
1948       continue;
1949
1950     invalid_code:
1951       src = src_base;
1952       consumed_chars = consumed_chars_base;
1953       ONE_MORE_BYTE (c);
1954       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1955       coding->errors++;
1956     }
1957
1958  no_more_source:
1959   coding->consumed_char += consumed_chars_base;
1960   coding->consumed = src_base - coding->source;
1961   coding->charbuf_used = charbuf - coding->charbuf;
1962 }
1963
1964
1965 #define EMACS_MULE_LEADING_CODES(id, codes)     \
1966   do {                                          \
1967     if (id < 0xA0)                              \
1968       codes[0] = id, codes[1] = 0;              \
1969     else if (id < 0xE0)                         \
1970       codes[0] = 0x9A, codes[1] = id;           \
1971     else if (id < 0xF0)                         \
1972       codes[0] = 0x9B, codes[1] = id;           \
1973     else if (id < 0xF5)                         \
1974       codes[0] = 0x9C, codes[1] = id;           \
1975     else                                        \
1976       codes[0] = 0x9D, codes[1] = id;           \
1977   } while (0);
1978
1979
1980 static int
1981 encode_coding_emacs_mule (coding)
1982      struct coding_system *coding;
1983 {
1984   int multibytep = coding->dst_multibyte;
1985   int *charbuf = coding->charbuf;
1986   int *charbuf_end = charbuf + coding->charbuf_used;
1987   unsigned char *dst = coding->destination + coding->produced;
1988   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1989   int safe_room = 8;
1990   int produced_chars = 0;
1991   Lisp_Object attrs, eol_type, charset_list;
1992   int c;
1993
1994   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1995
1996   while (charbuf < charbuf_end)
1997     {
1998       ASSURE_DESTINATION (safe_room);
1999       c = *charbuf++;
2000       if (ASCII_CHAR_P (c))
2001         EMIT_ONE_ASCII_BYTE (c);
2002       else
2003         {
2004           struct charset *charset;
2005           unsigned code;
2006           int dimension;
2007           int emacs_mule_id;
2008           unsigned char leading_codes[2];
2009
2010           charset = char_charset (c, charset_list, &code);
2011           if (! charset)
2012             {
2013               c = coding->default_char;
2014               if (ASCII_CHAR_P (c))
2015                 {
2016                   EMIT_ONE_ASCII_BYTE (c);
2017                   continue;
2018                 }
2019               charset = char_charset (c, charset_list, &code);
2020             }
2021           dimension = CHARSET_DIMENSION (charset);
2022           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2023           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2024           EMIT_ONE_BYTE (leading_codes[0]);
2025           if (leading_codes[1])
2026             EMIT_ONE_BYTE (leading_codes[1]);
2027           if (dimension == 1)
2028             EMIT_ONE_BYTE (code);
2029           else
2030             {
2031               EMIT_ONE_BYTE (code >> 8);
2032               EMIT_ONE_BYTE (code & 0xFF);
2033             }
2034         }
2035     }
2036   coding->result = CODING_RESULT_SUCCESS;
2037   coding->produced_char += produced_chars;
2038   coding->produced = dst - coding->destination;
2039   return 0;
2040 }
2041
2042 \f
2043 /*** 7. ISO2022 handlers ***/
2044
2045 /* The following note describes the coding system ISO2022 briefly.
2046    Since the intention of this note is to help understand the
2047    functions in this file, some parts are NOT ACCURATE or OVERLY
2048    SIMPLIFIED.  For thorough understanding, please refer to the
2049    original document of ISO2022.
2050
2051    ISO2022 provides many mechanisms to encode several character sets
2052    in 7-bit and 8-bit environments.  For 7-bite environments, all text
2053    is encoded using bytes less than 128.  This may make the encoded
2054    text a little bit longer, but the text passes more easily through
2055    several gateways, some of which strip off MSB (Most Signigant Bit).
2056
2057    There are two kinds of character sets: control character set and
2058    graphic character set.  The former contains control characters such
2059    as `newline' and `escape' to provide control functions (control
2060    functions are also provided by escape sequences).  The latter
2061    contains graphic characters such as 'A' and '-'.  Emacs recognizes
2062    two control character sets and many graphic character sets.
2063
2064    Graphic character sets are classified into one of the following
2065    four classes, according to the number of bytes (DIMENSION) and
2066    number of characters in one dimension (CHARS) of the set:
2067    - DIMENSION1_CHARS94
2068    - DIMENSION1_CHARS96
2069    - DIMENSION2_CHARS94
2070    - DIMENSION2_CHARS96
2071
2072    In addition, each character set is assigned an identification tag,
2073    unique for each set, called "final character" (denoted as <F>
2074    hereafter).  The <F> of each character set is decided by ECMA(*)
2075    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2076    (0x30..0x3F are for private use only).
2077
2078    Note (*): ECMA = European Computer Manufacturers Association
2079
2080    Here are examples of graphic character set [NAME(<F>)]:
2081         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2082         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2083         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2084         o DIMENSION2_CHARS96 -- none for the moment
2085
2086    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2087         C0 [0x00..0x1F] -- control character plane 0
2088         GL [0x20..0x7F] -- graphic character plane 0
2089         C1 [0x80..0x9F] -- control character plane 1
2090         GR [0xA0..0xFF] -- graphic character plane 1
2091
2092    A control character set is directly designated and invoked to C0 or
2093    C1 by an escape sequence.  The most common case is that:
2094    - ISO646's  control character set is designated/invoked to C0, and
2095    - ISO6429's control character set is designated/invoked to C1,
2096    and usually these designations/invocations are omitted in encoded
2097    text.  In a 7-bit environment, only C0 can be used, and a control
2098    character for C1 is encoded by an appropriate escape sequence to
2099    fit into the environment.  All control characters for C1 are
2100    defined to have corresponding escape sequences.
2101
2102    A graphic character set is at first designated to one of four
2103    graphic registers (G0 through G3), then these graphic registers are
2104    invoked to GL or GR.  These designations and invocations can be
2105    done independently.  The most common case is that G0 is invoked to
2106    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2107    these invocations and designations are omitted in encoded text.
2108    In a 7-bit environment, only GL can be used.
2109
2110    When a graphic character set of CHARS94 is invoked to GL, codes
2111    0x20 and 0x7F of the GL area work as control characters SPACE and
2112    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2113    be used.
2114
2115    There are two ways of invocation: locking-shift and single-shift.
2116    With locking-shift, the invocation lasts until the next different
2117    invocation, whereas with single-shift, the invocation affects the
2118    following character only and doesn't affect the locking-shift
2119    state.  Invocations are done by the following control characters or
2120    escape sequences:
2121
2122    ----------------------------------------------------------------------
2123    abbrev  function                  cntrl escape seq   description
2124    ----------------------------------------------------------------------
2125    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2126    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2127    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2128    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2129    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2130    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2131    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2132    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2133    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2134    ----------------------------------------------------------------------
2135    (*) These are not used by any known coding system.
2136
2137    Control characters for these functions are defined by macros
2138    ISO_CODE_XXX in `coding.h'.
2139
2140    Designations are done by the following escape sequences:
2141    ----------------------------------------------------------------------
2142    escape sequence      description
2143    ----------------------------------------------------------------------
2144    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2145    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2146    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2147    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2148    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2149    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2150    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2151    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2152    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2153    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2154    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2155    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2156    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2157    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2158    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2159    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2160    ----------------------------------------------------------------------
2161
2162    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2163    of dimension 1, chars 94, and final character <F>, etc...
2164
2165    Note (*): Although these designations are not allowed in ISO2022,
2166    Emacs accepts them on decoding, and produces them on encoding
2167    CHARS96 character sets in a coding system which is characterized as
2168    7-bit environment, non-locking-shift, and non-single-shift.
2169
2170    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2171    '(' must be omitted.  We refer to this as "short-form" hereafter.
2172
2173    Now you may notice that there are a lot of ways for encoding the
2174    same multilingual text in ISO2022.  Actually, there exist many
2175    coding systems such as Compound Text (used in X11's inter client
2176    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
2177    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
2178    localized platforms), and all of these are variants of ISO2022.
2179
2180    In addition to the above, Emacs handles two more kinds of escape
2181    sequences: ISO6429's direction specification and Emacs' private
2182    sequence for specifying character composition.
2183
2184    ISO6429's direction specification takes the following form:
2185         o CSI ']'      -- end of the current direction
2186         o CSI '0' ']'  -- end of the current direction
2187         o CSI '1' ']'  -- start of left-to-right text
2188         o CSI '2' ']'  -- start of right-to-left text
2189    The control character CSI (0x9B: control sequence introducer) is
2190    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2191
2192    Character composition specification takes the following form:
2193         o ESC '0' -- start relative composition
2194         o ESC '1' -- end composition
2195         o ESC '2' -- start rule-base composition (*)
2196         o ESC '3' -- start relative composition with alternate chars  (**)
2197         o ESC '4' -- start rule-base composition with alternate chars  (**)
2198   Since these are not standard escape sequences of any ISO standard,
2199   the use of them for these meaning is restricted to Emacs only.
2200
2201   (*) This form is used only in Emacs 20.5 and the older versions,
2202   but the newer versions can safely decode it.
2203   (**) This form is used only in Emacs 21.1 and the newer versions,
2204   and the older versions can't decode it.
2205
2206   Here's a list of examples usages of these composition escape
2207   sequences (categorized by `enum composition_method').
2208
2209   COMPOSITION_RELATIVE:
2210         ESC 0 CHAR [ CHAR ] ESC 1
2211   COMPOSITOIN_WITH_RULE:
2212         ESC 2 CHAR [ RULE CHAR ] ESC 1
2213   COMPOSITION_WITH_ALTCHARS:
2214         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2215   COMPOSITION_WITH_RULE_ALTCHARS:
2216         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2217
2218 enum iso_code_class_type iso_code_class[256];
2219
2220 #define SAFE_CHARSET_P(coding, id)      \
2221   ((id) <= (coding)->max_charset_id     \
2222    && (coding)->safe_charsets[id] >= 0)
2223
2224
2225 #define SHIFT_OUT_OK(category)  \
2226   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2227
2228 static void
2229 setup_iso_safe_charsets (attrs)
2230      Lisp_Object attrs;
2231 {
2232   Lisp_Object charset_list, safe_charsets;
2233   Lisp_Object request;
2234   Lisp_Object reg_usage;
2235   Lisp_Object tail;
2236   int reg94, reg96;
2237   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2238   int max_charset_id;
2239
2240   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2241   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2242       && ! EQ (charset_list, Viso_2022_charset_list))
2243     {
2244       CODING_ATTR_CHARSET_LIST (attrs)
2245         = charset_list = Viso_2022_charset_list;
2246       ASET (attrs, coding_attr_safe_charsets, Qnil);
2247     }
2248
2249   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2250     return;
2251
2252   max_charset_id = 0;
2253   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2254     {
2255       int id = XINT (XCAR (tail));
2256       if (max_charset_id < id)
2257         max_charset_id = id;
2258     }
2259
2260   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2261                                 make_number (255));
2262   request = AREF (attrs, coding_attr_iso_request);
2263   reg_usage = AREF (attrs, coding_attr_iso_usage);
2264   reg94 = XINT (XCAR (reg_usage));
2265   reg96 = XINT (XCDR (reg_usage));
2266
2267   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2268     {
2269       Lisp_Object id;
2270       Lisp_Object reg;
2271       struct charset *charset;
2272
2273       id = XCAR (tail);
2274       charset = CHARSET_FROM_ID (XINT (id));
2275       reg = Fcdr (Fassq (id, request));
2276       if (! NILP (reg))
2277         XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2278       else if (charset->iso_chars_96)
2279         {
2280           if (reg96 < 4)
2281             XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2282         }
2283       else
2284         {
2285           if (reg94 < 4)
2286             XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2287         }
2288     }
2289   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2290 }
2291
2292
2293 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2294    Check if a text is encoded in ISO2022.  If it is, returns an
2295    integer in which appropriate flag bits any of:
2296         CATEGORY_MASK_ISO_7
2297         CATEGORY_MASK_ISO_7_TIGHT
2298         CATEGORY_MASK_ISO_8_1
2299         CATEGORY_MASK_ISO_8_2
2300         CATEGORY_MASK_ISO_7_ELSE
2301         CATEGORY_MASK_ISO_8_ELSE
2302    are set.  If a code which should never appear in ISO2022 is found,
2303    returns 0.  */
2304
2305 static int
2306 detect_coding_iso_2022 (coding, mask)
2307      struct coding_system *coding;
2308      int *mask;
2309 {
2310   unsigned char *src = coding->source, *src_base = src;
2311   unsigned char *src_end = coding->source + coding->src_bytes;
2312   int multibytep = coding->src_multibyte;
2313   int mask_iso = CATEGORY_MASK_ISO;
2314   int mask_found = 0, mask_8bit_found = 0;
2315   int reg[4], shift_out = 0, single_shifting = 0;
2316   int id;
2317   int c, c1;
2318   int consumed_chars = 0;
2319   int i;
2320
2321   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2322     {
2323       struct coding_system *this = &(coding_categories[i]);
2324       Lisp_Object attrs, val;
2325
2326       attrs = CODING_ID_ATTRS (this->id);
2327       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2328           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2329         setup_iso_safe_charsets (attrs);
2330       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2331       this->max_charset_id = XSTRING (val)->size - 1;
2332       this->safe_charsets = (char *) XSTRING (val)->data;
2333     }
2334
2335   /* A coding system of this category is always ASCII compatible.  */
2336   src += coding->head_ascii;
2337
2338   reg[0] = charset_ascii, reg[1] = reg[2] = reg[3] = -1;
2339   while (mask_iso && src < src_end)
2340     {
2341       ONE_MORE_BYTE (c);
2342       switch (c)
2343         {
2344         case ISO_CODE_ESC:
2345           if (inhibit_iso_escape_detection)
2346             break;
2347           single_shifting = 0;
2348           ONE_MORE_BYTE (c);
2349           if (c >= '(' && c <= '/')
2350             {
2351               /* Designation sequence for a charset of dimension 1.  */
2352               ONE_MORE_BYTE (c1);
2353               if (c1 < ' ' || c1 >= 0x80
2354                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2355                 /* Invalid designation sequence.  Just ignore.  */
2356                 break;
2357               reg[(c - '(') % 4] = id;
2358             }
2359           else if (c == '$')
2360             {
2361               /* Designation sequence for a charset of dimension 2.  */
2362               ONE_MORE_BYTE (c);
2363               if (c >= '@' && c <= 'B')
2364                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2365                 reg[0] = id = iso_charset_table[1][0][c];
2366               else if (c >= '(' && c <= '/')
2367                 {
2368                   ONE_MORE_BYTE (c1);
2369                   if (c1 < ' ' || c1 >= 0x80
2370                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2371                     /* Invalid designation sequence.  Just ignore.  */
2372                     break;
2373                   reg[(c - '(') % 4] = id;
2374                 }
2375               else
2376                 /* Invalid designation sequence.  Just ignore.  */
2377                 break;
2378             }
2379           else if (c == 'N' || c == 'O')
2380             {
2381               /* ESC <Fe> for SS2 or SS3.  */
2382               mask_iso &= CATEGORY_MASK_ISO_7_ELSE;
2383               break;
2384             }
2385           else if (c >= '0' && c <= '4')
2386             {
2387               /* ESC <Fp> for start/end composition.  */
2388               mask_found |= CATEGORY_MASK_ISO;
2389               break;
2390             }
2391           else
2392             {
2393               /* Invalid escape sequence.  */
2394               mask_iso &= ~CATEGORY_MASK_ISO_ESCAPE;
2395               break;
2396             }
2397
2398           /* We found a valid designation sequence for CHARSET.  */
2399           mask_iso &= ~CATEGORY_MASK_ISO_8BIT;
2400           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2401                               id))
2402             mask_found |= CATEGORY_MASK_ISO_7;
2403           else
2404             mask_iso &= ~CATEGORY_MASK_ISO_7;
2405           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2406                               id))
2407             mask_found |= CATEGORY_MASK_ISO_7_TIGHT;
2408           else
2409             mask_iso &= ~CATEGORY_MASK_ISO_7_TIGHT;
2410           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2411                               id))
2412             mask_found |= CATEGORY_MASK_ISO_7_ELSE;
2413           else
2414             mask_iso &= ~CATEGORY_MASK_ISO_7_ELSE;
2415           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2416                               id))
2417             mask_found |= CATEGORY_MASK_ISO_8_ELSE;
2418           else
2419             mask_iso &= ~CATEGORY_MASK_ISO_8_ELSE;
2420           break;
2421
2422         case ISO_CODE_SO:
2423           if (inhibit_iso_escape_detection)
2424             break;
2425           single_shifting = 0;
2426           if (shift_out == 0
2427               && (reg[1] >= 0
2428                   || SHIFT_OUT_OK (coding_category_iso_7_else)
2429                   || SHIFT_OUT_OK (coding_category_iso_8_else)))
2430             {
2431               /* Locking shift out.  */
2432               mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2433               mask_found |= CATEGORY_MASK_ISO_ELSE;
2434             }
2435           break;
2436
2437         case ISO_CODE_SI:
2438           if (inhibit_iso_escape_detection)
2439             break;
2440           single_shifting = 0;
2441           if (shift_out == 1)
2442             {
2443               /* Locking shift in.  */
2444               mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2445               mask_found |= CATEGORY_MASK_ISO_ELSE;
2446             }
2447           break;
2448
2449         case ISO_CODE_CSI:
2450           single_shifting = 0;
2451         case ISO_CODE_SS2:
2452         case ISO_CODE_SS3:
2453           {
2454             int newmask = CATEGORY_MASK_ISO_8_ELSE;
2455
2456             if (inhibit_iso_escape_detection)
2457               break;
2458             if (c != ISO_CODE_CSI)
2459               {
2460                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2461                     & CODING_ISO_FLAG_SINGLE_SHIFT)
2462                   newmask |= CATEGORY_MASK_ISO_8_1;
2463                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2464                     & CODING_ISO_FLAG_SINGLE_SHIFT)
2465                   newmask |= CATEGORY_MASK_ISO_8_2;
2466                 single_shifting = 1;
2467               }
2468             if (VECTORP (Vlatin_extra_code_table)
2469                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2470               {
2471                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2472                     & CODING_ISO_FLAG_LATIN_EXTRA)
2473                   newmask |= CATEGORY_MASK_ISO_8_1;
2474                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2475                     & CODING_ISO_FLAG_LATIN_EXTRA)
2476                   newmask |= CATEGORY_MASK_ISO_8_2;
2477               }
2478             mask_iso &= newmask;
2479             mask_found |= newmask;
2480           }
2481           break;
2482
2483         default:
2484           if (c < 0x80)
2485             {
2486               single_shifting = 0;
2487               break;
2488             }
2489           else if (c < 0xA0)
2490             {
2491               single_shifting = 0;
2492               mask_8bit_found = 1;
2493               if (VECTORP (Vlatin_extra_code_table)
2494                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2495                 {
2496                   int newmask = 0;
2497
2498                   if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2499                       & CODING_ISO_FLAG_LATIN_EXTRA)
2500                     newmask |= CATEGORY_MASK_ISO_8_1;
2501                   if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2502                       & CODING_ISO_FLAG_LATIN_EXTRA)
2503                     newmask |= CATEGORY_MASK_ISO_8_2;
2504                   mask_iso &= newmask;
2505                   mask_found |= newmask;
2506                 }
2507               else
2508                 return 0;
2509             }
2510           else
2511             {
2512               mask_iso &= ~(CATEGORY_MASK_ISO_7BIT
2513                             | CATEGORY_MASK_ISO_7_ELSE);
2514               mask_found |= CATEGORY_MASK_ISO_8_1;
2515               mask_8bit_found = 1;
2516               /* Check the length of succeeding codes of the range
2517                  0xA0..0FF.  If the byte length is odd, we exclude
2518                  CATEGORY_MASK_ISO_8_2.  We can check this only
2519                  when we are not single shifting.  */
2520               if (!single_shifting
2521                   && mask_iso & CATEGORY_MASK_ISO_8_2)
2522                 {
2523                   int i = 1;
2524                   while (src < src_end)
2525                     {
2526                       ONE_MORE_BYTE (c);
2527                       if (c < 0xA0)
2528                         break;
2529                       i++;
2530                     }
2531
2532                   if (i & 1 && src < src_end)
2533                     mask_iso &= ~CATEGORY_MASK_ISO_8_2;
2534                   else
2535                     mask_found |= CATEGORY_MASK_ISO_8_2;
2536                 }
2537             }
2538           break;
2539         }
2540     }
2541  no_more_source:
2542   if (!mask_iso)
2543     {
2544       *mask &= ~CATEGORY_MASK_ISO;
2545       return 0;
2546     }
2547   if (!mask_found)
2548     return 0;
2549   *mask &= mask_iso & mask_found;
2550   if (! mask_8bit_found)
2551     *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE);
2552   return 1;
2553 }
2554
2555
2556 /* Set designation state into CODING.  */
2557 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2558   do {                                                                  \
2559     int id, prev;                                                       \
2560                                                                         \
2561     if (final < '0' || final >= 128                                     \
2562         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2563         || !SAFE_CHARSET_P (coding, id))                                \
2564       {                                                                 \
2565         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2566         goto invalid_code;                                              \
2567       }                                                                 \
2568     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2569     if (id == charset_jisx0201_roman)                                   \
2570       {                                                                 \
2571         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2572           id = charset_ascii;                                           \
2573       }                                                                 \
2574     else if (id == charset_jisx0208_1978)                               \
2575       {                                                                 \
2576         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2577           id = charset_jisx0208;                                        \
2578       }                                                                 \
2579     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2580     /* If there was an invalid designation to REG previously, and this  \
2581        designation is ASCII to REG, we should keep this designation     \
2582        sequence.  */                                                    \
2583     if (prev == -2 && id == charset_ascii)                              \
2584       goto invalid_code;                                                \
2585   } while (0)
2586
2587
2588 #define MAYBE_FINISH_COMPOSITION()                              \
2589   do {                                                          \
2590     int i;                                                      \
2591     if (composition_state == COMPOSING_NO)                      \
2592       break;                                                    \
2593     /* It is assured that we have enough room for producing     \
2594        characters stored in the table `components'.  */         \
2595     if (charbuf + component_idx > charbuf_end)                  \
2596       goto no_more_source;                                      \
2597     composition_state = COMPOSING_NO;                           \
2598     if (method == COMPOSITION_RELATIVE                          \
2599         || method == COMPOSITION_WITH_ALTCHARS)                 \
2600       {                                                         \
2601         for (i = 0; i < component_idx; i++)                     \
2602           *charbuf++ = components[i];                           \
2603         char_offset += component_idx;                           \
2604       }                                                         \
2605     else                                                        \
2606       {                                                         \
2607         for (i = 0; i < component_idx; i += 2)                  \
2608           *charbuf++ = components[i];                           \
2609         char_offset += (component_idx / 2) + 1;                 \
2610       }                                                         \
2611   } while (0)
2612
2613
2614 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2615    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2616    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2617    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2618    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2619   */
2620
2621 #define DECODE_COMPOSITION_START(c1)                                    \
2622   do {                                                                  \
2623     if (c1 == '0'                                                       \
2624         && composition_state == COMPOSING_COMPONENT_RULE)               \
2625       {                                                                 \
2626         component_len = component_idx;                                  \
2627         composition_state = COMPOSING_CHAR;                             \
2628       }                                                                 \
2629     else                                                                \
2630       {                                                                 \
2631         unsigned char *p;                                               \
2632                                                                         \
2633         MAYBE_FINISH_COMPOSITION ();                                    \
2634         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2635           goto no_more_source;                                          \
2636         for (p = src; p < src_end - 1; p++)                             \
2637           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2638             break;                                                      \
2639         if (p == src_end - 1)                                           \
2640           {                                                             \
2641             if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
2642               goto invalid_code;                                        \
2643             goto no_more_source;                                        \
2644           }                                                             \
2645                                                                         \
2646         /* This is surely the start of a composition.  */               \
2647         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2648                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2649                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2650                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2651         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2652                              : COMPOSING_COMPONENT_CHAR);               \
2653         component_idx = component_len = 0;                              \
2654       }                                                                 \
2655   } while (0)
2656
2657
2658 /* Handle compositoin end sequence ESC 1.  */
2659
2660 #define DECODE_COMPOSITION_END()                                        \
2661   do {                                                                  \
2662     int nchars = (component_len > 0 ? component_idx - component_len     \
2663                   : method == COMPOSITION_RELATIVE ? component_idx      \
2664                   : (component_idx + 1) / 2);                           \
2665     int i;                                                              \
2666     int *saved_charbuf = charbuf;                                       \
2667                                                                         \
2668     ADD_COMPOSITION_DATA (charbuf, method, nchars);                     \
2669     if (method != COMPOSITION_RELATIVE)                                 \
2670       {                                                                 \
2671         if (component_len == 0)                                         \
2672           for (i = 0; i < component_idx; i++)                           \
2673             *charbuf++ = components[i];                                 \
2674         else                                                            \
2675           for (i = 0; i < component_len; i++)                           \
2676             *charbuf++ = components[i];                                 \
2677         *saved_charbuf = saved_charbuf - charbuf;                       \
2678       }                                                                 \
2679     if (method == COMPOSITION_WITH_RULE)                                \
2680       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2681         *charbuf++ = components[i];                                     \
2682     else                                                                \
2683       for (i = component_len; i < component_idx; i++, char_offset++)    \
2684         *charbuf++ = components[i];                                     \
2685     coding->annotated = 1;                                              \
2686     composition_state = COMPOSING_NO;                                   \
2687   } while (0)
2688
2689
2690 /* Decode a composition rule from the byte C1 (and maybe one more byte
2691    from SRC) and store one encoded composition rule in
2692    coding->cmp_data.  */
2693
2694 #define DECODE_COMPOSITION_RULE(c1)                                     \
2695   do {                                                                  \
2696     (c1) -= 32;                                                         \
2697     if (c1 < 81)                /* old format (before ver.21) */        \
2698       {                                                                 \
2699         int gref = (c1) / 9;                                            \
2700         int nref = (c1) % 9;                                            \
2701         if (gref == 4) gref = 10;                                       \
2702         if (nref == 4) nref = 10;                                       \
2703         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2704       }                                                                 \
2705     else if (c1 < 93)           /* new format (after ver.21) */         \
2706       {                                                                 \
2707         ONE_MORE_BYTE (c2);                                             \
2708         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2709       }                                                                 \
2710     else                                                                \
2711       c1 = 0;                                                           \
2712   } while (0)
2713
2714
2715 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2716
2717 static void
2718 decode_coding_iso_2022 (coding)
2719      struct coding_system *coding;
2720 {
2721   unsigned char *src = coding->source + coding->consumed;
2722   unsigned char *src_end = coding->source + coding->src_bytes;
2723   unsigned char *src_base;
2724   int *charbuf = coding->charbuf;
2725   int *charbuf_end = charbuf + coding->charbuf_size - 4;
2726   int consumed_chars = 0, consumed_chars_base;
2727   int char_offset = 0;
2728   int multibytep = coding->src_multibyte;
2729   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2730   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2731   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2732   struct charset *charset;
2733   int c;
2734   /* For handling composition sequence.  */
2735 #define COMPOSING_NO                    0
2736 #define COMPOSING_CHAR                  1
2737 #define COMPOSING_RULE                  2
2738 #define COMPOSING_COMPONENT_CHAR        3
2739 #define COMPOSING_COMPONENT_RULE        4
2740
2741   int composition_state = COMPOSING_NO;
2742   enum composition_method method;
2743   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2744   int component_idx;
2745   int component_len;
2746   Lisp_Object attrs, eol_type, charset_list;
2747
2748   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2749   setup_iso_safe_charsets (attrs);
2750
2751   while (1)
2752     {
2753       int c1, c2;
2754
2755       src_base = src;
2756       consumed_chars_base = consumed_chars;
2757
2758       if (charbuf >= charbuf_end)
2759         break;
2760
2761       ONE_MORE_BYTE (c1);
2762
2763       /* We produce no character or one character.  */
2764       switch (iso_code_class [c1])
2765         {
2766         case ISO_0x20_or_0x7F:
2767           if (composition_state != COMPOSING_NO)
2768             {
2769               if (composition_state == COMPOSING_RULE
2770                   || composition_state == COMPOSING_COMPONENT_RULE)
2771                 {
2772                   DECODE_COMPOSITION_RULE (c1);
2773                   components[component_idx++] = c1;
2774                   composition_state--;
2775                   continue;
2776                 }
2777             }
2778           if (charset_id_0 < 0
2779               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2780             /* This is SPACE or DEL.  */
2781             charset = CHARSET_FROM_ID (charset_ascii);
2782           else
2783             charset = CHARSET_FROM_ID (charset_id_0);
2784           break;
2785
2786         case ISO_graphic_plane_0:
2787           if (composition_state != COMPOSING_NO)
2788             {
2789               if (composition_state == COMPOSING_RULE
2790                   || composition_state == COMPOSING_COMPONENT_RULE)
2791                 {
2792                   DECODE_COMPOSITION_RULE (c1);
2793                   components[component_idx++] = c1;
2794                   composition_state--;
2795                   continue;
2796                 }
2797             }
2798           charset = CHARSET_FROM_ID (charset_id_0);
2799           break;
2800
2801         case ISO_0xA0_or_0xFF:
2802           if (charset_id_1 < 0
2803               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2804               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2805             goto invalid_code;
2806           /* This is a graphic character, we fall down ... */
2807
2808         case ISO_graphic_plane_1:
2809           if (charset_id_1 < 0)
2810             goto invalid_code;
2811           charset = CHARSET_FROM_ID (charset_id_1);
2812           break;
2813
2814         case ISO_carriage_return:
2815           if (c1 == '\r')
2816             {
2817               if (EQ (eol_type, Qdos))
2818                 {
2819                   if (src == src_end)
2820                     goto no_more_source;
2821                   if (*src == '\n')
2822                     ONE_MORE_BYTE (c1);
2823                 }
2824               else if (EQ (eol_type, Qmac))
2825                 c1 = '\n';
2826             }
2827           /* fall through */
2828
2829         case ISO_control_0:
2830           MAYBE_FINISH_COMPOSITION ();
2831           charset = CHARSET_FROM_ID (charset_ascii);
2832           break;
2833
2834         case ISO_control_1:
2835           MAYBE_FINISH_COMPOSITION ();
2836           goto invalid_code;
2837
2838         case ISO_shift_out:
2839           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2840               || CODING_ISO_DESIGNATION (coding, 1) < 0)
2841             goto invalid_code;
2842           CODING_ISO_INVOCATION (coding, 0) = 1;
2843           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2844           continue;
2845
2846         case ISO_shift_in:
2847           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2848             goto invalid_code;
2849           CODING_ISO_INVOCATION (coding, 0) = 0;
2850           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2851           continue;
2852
2853         case ISO_single_shift_2_7:
2854         case ISO_single_shift_2:
2855           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2856             goto invalid_code;
2857           /* SS2 is handled as an escape sequence of ESC 'N' */
2858           c1 = 'N';
2859           goto label_escape_sequence;
2860
2861         case ISO_single_shift_3:
2862           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2863             goto invalid_code;
2864           /* SS2 is handled as an escape sequence of ESC 'O' */
2865           c1 = 'O';
2866           goto label_escape_sequence;
2867
2868         case ISO_control_sequence_introducer:
2869           /* CSI is handled as an escape sequence of ESC '[' ...  */
2870           c1 = '[';
2871           goto label_escape_sequence;
2872
2873         case ISO_escape:
2874           ONE_MORE_BYTE (c1);
2875         label_escape_sequence:
2876           /* Escape sequences handled here are invocation,
2877              designation, direction specification, and character
2878              composition specification.  */
2879           switch (c1)
2880             {
2881             case '&':           /* revision of following character set */
2882               ONE_MORE_BYTE (c1);
2883               if (!(c1 >= '@' && c1 <= '~'))
2884                 goto invalid_code;
2885               ONE_MORE_BYTE (c1);
2886               if (c1 != ISO_CODE_ESC)
2887                 goto invalid_code;
2888               ONE_MORE_BYTE (c1);
2889               goto label_escape_sequence;
2890
2891             case '$':           /* designation of 2-byte character set */
2892               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2893                 goto invalid_code;
2894               ONE_MORE_BYTE (c1);
2895               if (c1 >= '@' && c1 <= 'B')
2896                 {       /* designation of JISX0208.1978, GB2312.1980,
2897                            or JISX0208.1980 */
2898                   DECODE_DESIGNATION (0, 2, 0, c1);
2899                 }
2900               else if (c1 >= 0x28 && c1 <= 0x2B)
2901                 {       /* designation of DIMENSION2_CHARS94 character set */
2902                   ONE_MORE_BYTE (c2);
2903                   DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2904                 }
2905               else if (c1 >= 0x2C && c1 <= 0x2F)
2906                 {       /* designation of DIMENSION2_CHARS96 character set */
2907                   ONE_MORE_BYTE (c2);
2908                   DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2909                 }
2910               else
2911                 goto invalid_code;
2912               /* We must update these variables now.  */
2913               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2914               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2915               continue;
2916
2917             case 'n':           /* invocation of locking-shift-2 */
2918               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2919                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
2920                 goto invalid_code;
2921               CODING_ISO_INVOCATION (coding, 0) = 2;
2922               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2923               continue;
2924
2925             case 'o':           /* invocation of locking-shift-3 */
2926               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2927                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
2928                 goto invalid_code;
2929               CODING_ISO_INVOCATION (coding, 0) = 3;
2930               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2931               continue;
2932
2933             case 'N':           /* invocation of single-shift-2 */
2934               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2935                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
2936                 goto invalid_code;
2937               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
2938               ONE_MORE_BYTE (c1);
2939               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2940                 goto invalid_code;
2941               break;
2942
2943             case 'O':           /* invocation of single-shift-3 */
2944               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2945                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
2946                 goto invalid_code;
2947               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
2948               ONE_MORE_BYTE (c1);
2949               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2950                 goto invalid_code;
2951               break;
2952
2953             case '0': case '2': case '3': case '4': /* start composition */
2954               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
2955                 goto invalid_code;
2956               DECODE_COMPOSITION_START (c1);
2957               continue;
2958
2959             case '1':           /* end composition */
2960               if (composition_state == COMPOSING_NO)
2961                 goto invalid_code;
2962               DECODE_COMPOSITION_END ();
2963               continue;
2964
2965             case '[':           /* specification of direction */
2966               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
2967                 goto invalid_code;
2968               /* For the moment, nested direction is not supported.
2969                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2970                  left-to-right, and nozero means right-to-left.  */
2971               ONE_MORE_BYTE (c1);
2972               switch (c1)
2973                 {
2974                 case ']':       /* end of the current direction */
2975                   coding->mode &= ~CODING_MODE_DIRECTION;
2976
2977                 case '0':       /* end of the current direction */
2978                 case '1':       /* start of left-to-right direction */
2979                   ONE_MORE_BYTE (c1);
2980                   if (c1 == ']')
2981                     coding->mode &= ~CODING_MODE_DIRECTION;
2982                   else
2983                     goto invalid_code;
2984                   break;
2985
2986                 case '2':       /* start of right-to-left direction */
2987                   ONE_MORE_BYTE (c1);
2988                   if (c1 == ']')
2989                     coding->mode |= CODING_MODE_DIRECTION;
2990                   else
2991                     goto invalid_code;
2992                   break;
2993
2994                 default:
2995                   goto invalid_code;
2996                 }
2997               continue;
2998
2999             default:
3000               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3001                 goto invalid_code;
3002               if (c1 >= 0x28 && c1 <= 0x2B)
3003                 {       /* designation of DIMENSION1_CHARS94 character set */
3004                   ONE_MORE_BYTE (c2);
3005                   DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3006                 }
3007               else if (c1 >= 0x2C && c1 <= 0x2F)
3008                 {       /* designation of DIMENSION1_CHARS96 character set */
3009                   ONE_MORE_BYTE (c2);
3010                   DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3011                 }
3012               else
3013                 goto invalid_code;
3014               /* We must update these variables now.  */
3015               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3016               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3017               continue;
3018             }
3019         }
3020
3021       /* Now we know CHARSET and 1st position code C1 of a character.
3022          Produce a decoded character while getting 2nd position code
3023          C2 if necessary.  */
3024       c1 &= 0x7F;
3025       if (CHARSET_DIMENSION (charset) > 1)
3026         {
3027           ONE_MORE_BYTE (c2);
3028           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3029             /* C2 is not in a valid range.  */
3030             goto invalid_code;
3031           c1 = (c1 << 8) | (c2 & 0x7F);
3032           if (CHARSET_DIMENSION (charset) > 2)
3033             {
3034               ONE_MORE_BYTE (c2);
3035               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3036                 /* C2 is not in a valid range.  */
3037                 goto invalid_code;
3038               c1 = (c1 << 8) | (c2 & 0x7F);
3039             }
3040         }
3041
3042       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3043       if (c < 0)
3044         {
3045           MAYBE_FINISH_COMPOSITION ();
3046           for (; src_base < src; src_base++, char_offset++)
3047             {
3048               if (ASCII_BYTE_P (*src_base))
3049                 *charbuf++ = *src_base;
3050               else
3051                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3052             }
3053         }
3054       else if (composition_state == COMPOSING_NO)
3055         {
3056           *charbuf++ = c;
3057           char_offset++;
3058         }
3059       else
3060         {
3061           components[component_idx++] = c;
3062           if (method == COMPOSITION_WITH_RULE
3063               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3064                   && composition_state == COMPOSING_COMPONENT_CHAR))
3065             composition_state++;
3066         }
3067       continue;
3068
3069     invalid_code:
3070       MAYBE_FINISH_COMPOSITION ();
3071       src = src_base;
3072       consumed_chars = consumed_chars_base;
3073       ONE_MORE_BYTE (c);
3074       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3075       coding->errors++;
3076     }
3077
3078  no_more_source:
3079   coding->consumed_char += consumed_chars_base;
3080   coding->consumed = src_base - coding->source;
3081   coding->charbuf_used = charbuf - coding->charbuf;
3082 }
3083
3084
3085 /* ISO2022 encoding stuff.  */
3086
3087 /*
3088    It is not enough to say just "ISO2022" on encoding, we have to
3089    specify more details.  In Emacs, each coding system of ISO2022
3090    variant has the following specifications:
3091         1. Initial designation to G0 thru G3.
3092         2. Allows short-form designation?
3093         3. ASCII should be designated to G0 before control characters?
3094         4. ASCII should be designated to G0 at end of line?
3095         5. 7-bit environment or 8-bit environment?
3096         6. Use locking-shift?
3097         7. Use Single-shift?
3098    And the following two are only for Japanese:
3099         8. Use ASCII in place of JIS0201-1976-Roman?
3100         9. Use JISX0208-1983 in place of JISX0208-1978?
3101    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3102    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3103    details.
3104 */
3105
3106 /* Produce codes (escape sequence) for designating CHARSET to graphic
3107    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3108    '@', 'A', or 'B' and the coding system CODING allows, produce
3109    designation sequence of short-form.  */
3110
3111 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3112   do {                                                                  \
3113     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3114     char *intermediate_char_94 = "()*+";                                \
3115     char *intermediate_char_96 = ",-./";                                \
3116     int revision = -1;                                                  \
3117     int c;                                                              \
3118                                                                         \
3119     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3120       revision = XINT (CHARSET_ISO_REVISION (charset));                 \
3121                                                                         \
3122     if (revision >= 0)                                                  \
3123       {                                                                 \
3124         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3125         EMIT_ONE_BYTE ('@' + revision);                                 \
3126       }                                                                 \
3127     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3128     if (CHARSET_DIMENSION (charset) == 1)                               \
3129       {                                                                 \
3130         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3131           c = intermediate_char_94[reg];                                \
3132         else                                                            \
3133           c = intermediate_char_96[reg];                                \
3134         EMIT_ONE_ASCII_BYTE (c);                                        \
3135       }                                                                 \
3136     else                                                                \
3137       {                                                                 \
3138         EMIT_ONE_ASCII_BYTE ('$');                                      \
3139         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3140           {                                                             \
3141             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3142                 || reg != 0                                             \
3143                 || final_char < '@' || final_char > 'B')                \
3144               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3145           }                                                             \
3146         else                                                            \
3147           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3148       }                                                                 \
3149     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3150                                                                         \
3151     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3152   } while (0)
3153
3154
3155 /* The following two macros produce codes (control character or escape
3156    sequence) for ISO2022 single-shift functions (single-shift-2 and
3157    single-shift-3).  */
3158
3159 #define ENCODE_SINGLE_SHIFT_2                                           \
3160   do {                                                                  \
3161     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3162       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3163     else                                                                \
3164       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3165     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3166   } while (0)
3167
3168
3169 #define ENCODE_SINGLE_SHIFT_3                                           \
3170   do {                                                                  \
3171     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3172       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
3173     else                                                                \
3174       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
3175     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3176   } while (0)
3177
3178
3179 /* The following four macros produce codes (control character or
3180    escape sequence) for ISO2022 locking-shift functions (shift-in,
3181    shift-out, locking-shift-2, and locking-shift-3).  */
3182
3183 #define ENCODE_SHIFT_IN                                 \
3184   do {                                                  \
3185     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
3186     CODING_ISO_INVOCATION (coding, 0) = 0;              \
3187   } while (0)
3188
3189
3190 #define ENCODE_SHIFT_OUT                                \
3191   do {                                                  \
3192     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
3193     CODING_ISO_INVOCATION (coding, 0) = 1;              \
3194   } while (0)
3195
3196
3197 #define ENCODE_LOCKING_SHIFT_2                          \
3198   do {                                                  \
3199     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3200     CODING_ISO_INVOCATION (coding, 0) = 2;              \
3201   } while (0)
3202
3203
3204 #define ENCODE_LOCKING_SHIFT_3                          \
3205   do {                                                  \
3206     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3207     CODING_ISO_INVOCATION (coding, 0) = 3;              \
3208   } while (0)
3209
3210
3211 /* Produce codes for a DIMENSION1 character whose character set is
3212    CHARSET and whose position-code is C1.  Designation and invocation
3213    sequences are also produced in advance if necessary.  */
3214
3215 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
3216   do {                                                                  \
3217     int id = CHARSET_ID (charset);                                      \
3218                                                                         \
3219     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
3220         && id == charset_ascii)                                         \
3221       {                                                                 \
3222         id = charset_jisx0201_roman;                                    \
3223         charset = CHARSET_FROM_ID (id);                                 \
3224       }                                                                 \
3225                                                                         \
3226     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3227       {                                                                 \
3228         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3229           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
3230         else                                                            \
3231           EMIT_ONE_BYTE (c1 | 0x80);                                    \
3232         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3233         break;                                                          \
3234       }                                                                 \
3235     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3236       {                                                                 \
3237         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
3238         break;                                                          \
3239       }                                                                 \
3240     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3241       {                                                                 \
3242         EMIT_ONE_BYTE (c1 | 0x80);                                      \
3243         break;                                                          \
3244       }                                                                 \
3245     else                                                                \
3246       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3247          must invoke it, or, at first, designate it to some graphic     \
3248          register.  Then repeat the loop to actually produce the        \
3249          character.  */                                                 \
3250       dst = encode_invocation_designation (charset, coding, dst,        \
3251                                            &produced_chars);            \
3252   } while (1)
3253
3254
3255 /* Produce codes for a DIMENSION2 character whose character set is
3256    CHARSET and whose position-codes are C1 and C2.  Designation and
3257    invocation codes are also produced in advance if necessary.  */
3258
3259 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
3260   do {                                                                  \
3261     int id = CHARSET_ID (charset);                                      \
3262                                                                         \
3263     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
3264         && id == charset_jisx0208)                                      \
3265       {                                                                 \
3266         id = charset_jisx0208_1978;                                     \
3267         charset = CHARSET_FROM_ID (id);                                 \
3268       }                                                                 \
3269                                                                         \
3270     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3271       {                                                                 \
3272         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3273           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
3274         else                                                            \
3275           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
3276         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3277         break;                                                          \
3278       }                                                                 \
3279     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3280       {                                                                 \
3281         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
3282         break;                                                          \
3283       }                                                                 \
3284     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3285       {                                                                 \
3286         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
3287         break;                                                          \
3288       }                                                                 \
3289     else                                                                \
3290       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3291          must invoke it, or, at first, designate it to some graphic     \
3292          register.  Then repeat the loop to actually produce the        \
3293          character.  */                                                 \
3294       dst = encode_invocation_designation (charset, coding, dst,        \
3295                                            &produced_chars);            \
3296   } while (1)
3297
3298
3299 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
3300   do {                                                                     \
3301     int code = ENCODE_CHAR ((charset),(c));                                \
3302                                                                            \
3303     if (CHARSET_DIMENSION (charset) == 1)                                  \
3304       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
3305     else                                                                   \
3306       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3307   } while (0)
3308
3309
3310 /* Produce designation and invocation codes at a place pointed by DST
3311    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3312    Return new DST.  */
3313
3314 unsigned char *
3315 encode_invocation_designation (charset, coding, dst, p_nchars)
3316      struct charset *charset;
3317      struct coding_system *coding;
3318      unsigned char *dst;
3319      int *p_nchars;
3320 {
3321   int multibytep = coding->dst_multibyte;
3322   int produced_chars = *p_nchars;
3323   int reg;                      /* graphic register number */
3324   int id = CHARSET_ID (charset);
3325
3326   /* At first, check designations.  */
3327   for (reg = 0; reg < 4; reg++)
3328     if (id == CODING_ISO_DESIGNATION (coding, reg))
3329       break;
3330
3331   if (reg >= 4)
3332     {
3333       /* CHARSET is not yet designated to any graphic registers.  */
3334       /* At first check the requested designation.  */
3335       reg = CODING_ISO_REQUEST (coding, id);
3336       if (reg < 0)
3337         /* Since CHARSET requests no special designation, designate it
3338            to graphic register 0.  */
3339         reg = 0;
3340
3341       ENCODE_DESIGNATION (charset, reg, coding);
3342     }
3343
3344   if (CODING_ISO_INVOCATION (coding, 0) != reg
3345       && CODING_ISO_INVOCATION (coding, 1) != reg)
3346     {
3347       /* Since the graphic register REG is not invoked to any graphic
3348          planes, invoke it to graphic plane 0.  */
3349       switch (reg)
3350         {
3351         case 0:                 /* graphic register 0 */
3352           ENCODE_SHIFT_IN;
3353           break;
3354
3355         case 1:                 /* graphic register 1 */
3356           ENCODE_SHIFT_OUT;
3357           break;
3358
3359         case 2:                 /* graphic register 2 */
3360           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3361             ENCODE_SINGLE_SHIFT_2;
3362           else
3363             ENCODE_LOCKING_SHIFT_2;
3364           break;
3365
3366         case 3:                 /* graphic register 3 */
3367           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3368             ENCODE_SINGLE_SHIFT_3;
3369           else
3370             ENCODE_LOCKING_SHIFT_3;
3371           break;
3372         }
3373     }
3374
3375   *p_nchars = produced_chars;
3376   return dst;
3377 }
3378
3379 /* The following three macros produce codes for indicating direction
3380    of text.  */
3381 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
3382   do {                                                                  \
3383     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
3384       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
3385     else                                                                \
3386       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
3387   } while (0)
3388
3389
3390 #define ENCODE_DIRECTION_R2L()                  \
3391   do {                                          \
3392     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3393     EMIT_TWO_ASCII_BYTES ('2', ']');            \
3394   } while (0)
3395
3396
3397 #define ENCODE_DIRECTION_L2R()                  \
3398   do {                                          \
3399     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3400     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3401   } while (0)
3402
3403
3404 /* Produce codes for designation and invocation to reset the graphic
3405    planes and registers to initial state.  */
3406 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3407   do {                                                                  \
3408     int reg;                                                            \
3409     struct charset *charset;                                            \
3410                                                                         \
3411     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3412       ENCODE_SHIFT_IN;                                                  \
3413     for (reg = 0; reg < 4; reg++)                                       \
3414       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3415           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3416               != CODING_ISO_INITIAL (coding, reg)))                     \
3417         {                                                               \
3418           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3419           ENCODE_DESIGNATION (charset, reg, coding);                    \
3420         }                                                               \
3421   } while (0)
3422
3423
3424 /* Produce designation sequences of charsets in the line started from
3425    SRC to a place pointed by DST, and return updated DST.
3426
3427    If the current block ends before any end-of-line, we may fail to
3428    find all the necessary designations.  */
3429
3430 static unsigned char *
3431 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3432      struct coding_system *coding;
3433      int *charbuf, *charbuf_end;
3434      unsigned char *dst;
3435 {
3436   struct charset *charset;
3437   /* Table of charsets to be designated to each graphic register.  */
3438   int r[4];
3439   int c, found = 0, reg;
3440   int produced_chars = 0;
3441   int multibytep = coding->dst_multibyte;
3442   Lisp_Object attrs;
3443   Lisp_Object charset_list;
3444
3445   attrs = CODING_ID_ATTRS (coding->id);
3446   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3447   if (EQ (charset_list, Qiso_2022))
3448     charset_list = Viso_2022_charset_list;
3449
3450   for (reg = 0; reg < 4; reg++)
3451     r[reg] = -1;
3452
3453   while (found < 4)
3454     {
3455       int id;
3456
3457       c = *charbuf++;
3458       if (c == '\n')
3459         break;
3460       charset = char_charset (c, charset_list, NULL);
3461       id = CHARSET_ID (charset);
3462       reg = CODING_ISO_REQUEST (coding, id);
3463       if (reg >= 0 && r[reg] < 0)
3464         {
3465           found++;
3466           r[reg] = id;
3467         }
3468     }
3469
3470   if (found)
3471     {
3472       for (reg = 0; reg < 4; reg++)
3473         if (r[reg] >= 0
3474             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3475           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3476     }
3477
3478   return dst;
3479 }
3480
3481 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3482
3483 static int
3484 encode_coding_iso_2022 (coding)
3485      struct coding_system *coding;
3486 {
3487   int multibytep = coding->dst_multibyte;
3488   int *charbuf = coding->charbuf;
3489   int *charbuf_end = charbuf + coding->charbuf_used;
3490   unsigned char *dst = coding->destination + coding->produced;
3491   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3492   int safe_room = 16;
3493   int bol_designation
3494     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3495        && CODING_ISO_BOL (coding));
3496   int produced_chars = 0;
3497   Lisp_Object attrs, eol_type, charset_list;
3498   int ascii_compatible;
3499   int c;
3500
3501   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3502   setup_iso_safe_charsets (attrs);
3503   coding->safe_charsets
3504     = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
3505
3506   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3507
3508   while (charbuf < charbuf_end)
3509     {
3510       ASSURE_DESTINATION (safe_room);
3511
3512       if (bol_designation)
3513         {
3514           unsigned char *dst_prev = dst;
3515
3516           /* We have to produce designation sequences if any now.  */
3517           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3518           bol_designation = 0;
3519           /* We are sure that designation sequences are all ASCII bytes.  */
3520           produced_chars += dst - dst_prev;
3521         }
3522
3523       c = *charbuf++;
3524
3525       /* Now encode the character C.  */
3526       if (c < 0x20 || c == 0x7F)
3527         {
3528           if (c == '\n'
3529               || (c == '\r' && EQ (eol_type, Qmac)))
3530             {
3531               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3532                 ENCODE_RESET_PLANE_AND_REGISTER ();
3533               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3534                 {
3535                   int i;
3536
3537                   for (i = 0; i < 4; i++)
3538                     CODING_ISO_DESIGNATION (coding, i)
3539                       = CODING_ISO_INITIAL (coding, i);
3540                 }
3541               bol_designation
3542                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3543             }
3544           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3545             ENCODE_RESET_PLANE_AND_REGISTER ();
3546           EMIT_ONE_ASCII_BYTE (c);
3547         }
3548       else if (ASCII_CHAR_P (c))
3549         {
3550           if (ascii_compatible)
3551             EMIT_ONE_ASCII_BYTE (c);
3552           else
3553             {
3554               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3555               ENCODE_ISO_CHARACTER (charset, c);
3556             }
3557         }
3558       else
3559         {
3560           struct charset *charset = char_charset (c, charset_list, NULL);
3561
3562           if (!charset)
3563             {
3564               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3565                 {
3566                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3567                   charset = CHARSET_FROM_ID (charset_ascii);
3568                 }
3569               else
3570                 {
3571                   c = coding->default_char;
3572                   charset = char_charset (c, charset_list, NULL);
3573                 }
3574             }
3575           ENCODE_ISO_CHARACTER (charset, c);
3576         }
3577     }
3578
3579   if (coding->mode & CODING_MODE_LAST_BLOCK
3580       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3581     {
3582       ASSURE_DESTINATION (safe_room);
3583       ENCODE_RESET_PLANE_AND_REGISTER ();
3584     }
3585   coding->result = CODING_RESULT_SUCCESS;
3586   CODING_ISO_BOL (coding) = bol_designation;
3587   coding->produced_char += produced_chars;
3588   coding->produced = dst - coding->destination;
3589   return 0;
3590 }
3591
3592 \f
3593 /*** 8,9. SJIS and BIG5 handlers ***/
3594
3595 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3596    quite widely.  So, for the moment, Emacs supports them in the bare
3597    C code.  But, in the future, they may be supported only by CCL.  */
3598
3599 /* SJIS is a coding system encoding three character sets: ASCII, right
3600    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3601    as is.  A character of charset katakana-jisx0201 is encoded by
3602    "position-code + 0x80".  A character of charset japanese-jisx0208
3603    is encoded in 2-byte but two position-codes are divided and shifted
3604    so that it fit in the range below.
3605
3606    --- CODE RANGE of SJIS ---
3607    (character set)      (range)
3608    ASCII                0x00 .. 0x7F
3609    KATAKANA-JISX0201    0xA0 .. 0xDF
3610    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3611             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3612    -------------------------------
3613
3614 */
3615
3616 /* BIG5 is a coding system encoding two character sets: ASCII and
3617    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3618    character set and is encoded in two-byte.
3619
3620    --- CODE RANGE of BIG5 ---
3621    (character set)      (range)
3622    ASCII                0x00 .. 0x7F
3623    Big5 (1st byte)      0xA1 .. 0xFE
3624         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3625    --------------------------
3626
3627   */
3628
3629 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3630    Check if a text is encoded in SJIS.  If it is, return
3631    CATEGORY_MASK_SJIS, else return 0.  */
3632
3633 static int
3634 detect_coding_sjis (coding, mask)
3635      struct coding_system *coding;
3636      int *mask;
3637 {
3638   unsigned char *src = coding->source, *src_base = src;
3639   unsigned char *src_end = coding->source + coding->src_bytes;
3640   int multibytep = coding->src_multibyte;
3641   int consumed_chars = 0;
3642   int found = 0;
3643   int c;
3644
3645   /* A coding system of this category is always ASCII compatible.  */
3646   src += coding->head_ascii;
3647
3648   while (1)
3649     {
3650       ONE_MORE_BYTE (c);
3651       if (c < 0x80)
3652         continue;
3653       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3654         {
3655           ONE_MORE_BYTE (c);
3656           if (c < 0x40 || c == 0x7F || c > 0xFC)
3657             break;
3658           found = 1;
3659         }
3660       else if (c >= 0xA0 && c < 0xE0)
3661         found = 1;
3662       else
3663         break;
3664     }
3665   *mask &= ~CATEGORY_MASK_SJIS;
3666   return 0;
3667
3668  no_more_source:
3669   if (!found)
3670     return 0;
3671   *mask &= CATEGORY_MASK_SJIS;
3672   return 1;
3673 }
3674
3675 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3676    Check if a text is encoded in BIG5.  If it is, return
3677    CATEGORY_MASK_BIG5, else return 0.  */
3678
3679 static int
3680 detect_coding_big5 (coding, mask)
3681      struct coding_system *coding;
3682      int *mask;
3683 {
3684   unsigned char *src = coding->source, *src_base = src;
3685   unsigned char *src_end = coding->source + coding->src_bytes;
3686   int multibytep = coding->src_multibyte;
3687   int consumed_chars = 0;
3688   int found = 0;
3689   int c;
3690
3691   /* A coding system of this category is always ASCII compatible.  */
3692   src += coding->head_ascii;
3693
3694   while (1)
3695     {
3696       ONE_MORE_BYTE (c);
3697       if (c < 0x80)
3698         continue;
3699       if (c >= 0xA1)
3700         {
3701           ONE_MORE_BYTE (c);
3702           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3703             return 0;
3704           found = 1;
3705         }
3706       else
3707         break;
3708     }
3709   *mask &= ~CATEGORY_MASK_BIG5;
3710   return 0;
3711
3712  no_more_source:
3713   if (!found)
3714     return 0;
3715   *mask &= CATEGORY_MASK_BIG5;
3716   return 1;
3717 }
3718
3719 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3720    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3721
3722 static void
3723 decode_coding_sjis (coding)
3724      struct coding_system *coding;
3725 {
3726   unsigned char *src = coding->source + coding->consumed;
3727   unsigned char *src_end = coding->source + coding->src_bytes;
3728   unsigned char *src_base;
3729   int *charbuf = coding->charbuf;
3730   int *charbuf_end = charbuf + coding->charbuf_size;
3731   int consumed_chars = 0, consumed_chars_base;
3732   int multibytep = coding->src_multibyte;
3733   struct charset *charset_roman, *charset_kanji, *charset_kana;
3734   Lisp_Object attrs, eol_type, charset_list, val;
3735
3736   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3737
3738   val = charset_list;
3739   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3740   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3741   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
3742
3743   while (1)
3744     {
3745       int c, c1;
3746
3747       src_base = src;
3748       consumed_chars_base = consumed_chars;
3749
3750       if (charbuf >= charbuf_end)
3751         break;
3752
3753       ONE_MORE_BYTE (c);
3754
3755       if (c == '\r')
3756         {
3757           if (EQ (eol_type, Qdos))
3758             {
3759               if (src == src_end)
3760                 goto no_more_source;
3761               if (*src == '\n')
3762                 ONE_MORE_BYTE (c);
3763             }
3764           else if (EQ (eol_type, Qmac))
3765             c = '\n';
3766         }
3767       else
3768         {
3769           struct charset *charset;
3770
3771           if (c < 0x80)
3772             charset = charset_roman;
3773           else
3774             {
3775               if (c >= 0xF0)
3776                 goto invalid_code;
3777               if (c < 0xA0 || c >= 0xE0)
3778                 {
3779                   /* SJIS -> JISX0208 */
3780                   ONE_MORE_BYTE (c1);
3781                   if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3782                     goto invalid_code;
3783                   c = (c << 8) | c1;
3784                   SJIS_TO_JIS (c);
3785                   charset = charset_kanji;
3786                 }
3787               else
3788                 /* SJIS -> JISX0201-Kana */
3789                 charset = charset_kana;
3790             }
3791           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3792         }
3793       *charbuf++ = c;
3794       continue;
3795
3796     invalid_code:
3797       src = src_base;
3798       consumed_chars = consumed_chars_base;
3799       ONE_MORE_BYTE (c);
3800       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3801       coding->errors++;
3802     }
3803
3804  no_more_source:
3805   coding->consumed_char += consumed_chars_base;
3806   coding->consumed = src_base - coding->source;
3807   coding->charbuf_used = charbuf - coding->charbuf;
3808 }
3809
3810 static void
3811 decode_coding_big5 (coding)
3812      struct coding_system *coding;
3813 {
3814   unsigned char *src = coding->source + coding->consumed;
3815   unsigned char *src_end = coding->source + coding->src_bytes;
3816   unsigned char *src_base;
3817   int *charbuf = coding->charbuf;
3818   int *charbuf_end = charbuf + coding->charbuf_size;
3819   int consumed_chars = 0, consumed_chars_base;
3820   int multibytep = coding->src_multibyte;
3821   struct charset *charset_roman, *charset_big5;
3822   Lisp_Object attrs, eol_type, charset_list, val;
3823
3824   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3825   val = charset_list;
3826   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3827   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3828
3829   while (1)
3830     {
3831       int c, c1;
3832
3833       src_base = src;
3834       consumed_chars_base = consumed_chars;
3835
3836       if (charbuf >= charbuf_end)
3837         break;
3838
3839       ONE_MORE_BYTE (c);
3840
3841       if (c == '\r')
3842         {
3843           if (EQ (eol_type, Qdos))
3844             {
3845               if (src == src_end)
3846                 goto no_more_source;
3847               if (*src == '\n')
3848                 ONE_MORE_BYTE (c);
3849             }
3850           else if (EQ (eol_type, Qmac))
3851             c = '\n';
3852         }
3853       else
3854         {
3855           struct charset *charset;
3856           if (c < 0x80)
3857             charset = charset_roman;
3858           else
3859             {
3860               /* BIG5 -> Big5 */
3861               if (c < 0xA1 || c > 0xFE)
3862                 goto invalid_code;
3863               ONE_MORE_BYTE (c1);
3864               if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
3865                 goto invalid_code;
3866               c = c << 8 | c1;
3867               charset = charset_big5;
3868             }
3869           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3870         }
3871
3872       *charbuf++ = c;
3873       continue;
3874
3875     invalid_code:
3876       src = src_base;
3877       consumed_chars = consumed_chars_base;
3878       ONE_MORE_BYTE (c);
3879       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3880       coding->errors++;
3881     }
3882
3883  no_more_source:
3884   coding->consumed_char += consumed_chars_base;
3885   coding->consumed = src_base - coding->source;
3886   coding->charbuf_used = charbuf - coding->charbuf;
3887 }
3888
3889 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3890    This function can encode charsets `ascii', `katakana-jisx0201',
3891    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3892    are sure that all these charsets are registered as official charset
3893    (i.e. do not have extended leading-codes).  Characters of other
3894    charsets are produced without any encoding.  If SJIS_P is 1, encode
3895    SJIS text, else encode BIG5 text.  */
3896
3897 static int
3898 encode_coding_sjis (coding)
3899      struct coding_system *coding;
3900 {
3901   int multibytep = coding->dst_multibyte;
3902   int *charbuf = coding->charbuf;
3903   int *charbuf_end = charbuf + coding->charbuf_used;
3904   unsigned char *dst = coding->destination + coding->produced;
3905   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3906   int safe_room = 4;
3907   int produced_chars = 0;
3908   Lisp_Object attrs, eol_type, charset_list, val;
3909   int ascii_compatible;
3910   struct charset *charset_roman, *charset_kanji, *charset_kana;
3911   int c;
3912
3913   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3914   val = charset_list;
3915   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3916   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3917   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3918
3919   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3920
3921   while (charbuf < charbuf_end)
3922     {
3923       ASSURE_DESTINATION (safe_room);
3924       c = *charbuf++;
3925       /* Now encode the character C.  */
3926       if (ASCII_CHAR_P (c) && ascii_compatible)
3927         EMIT_ONE_ASCII_BYTE (c);
3928       else
3929         {
3930           unsigned code;
3931           struct charset *charset = char_charset (c, charset_list, &code);
3932
3933           if (!charset)
3934             {
3935               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3936                 {
3937                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3938                   charset = CHARSET_FROM_ID (charset_ascii);
3939                 }
3940               else
3941                 {
3942                   c = coding->default_char;
3943                   charset = char_charset (c, charset_list, &code);
3944                 }
3945             }
3946           if (code == CHARSET_INVALID_CODE (charset))
3947             abort ();
3948           if (charset == charset_kanji)
3949             {
3950               int c1, c2;
3951               JIS_TO_SJIS (code);
3952               c1 = code >> 8, c2 = code & 0xFF;
3953               EMIT_TWO_BYTES (c1, c2);
3954             }
3955           else if (charset == charset_kana)
3956             EMIT_ONE_BYTE (code | 0x80);
3957           else
3958             EMIT_ONE_ASCII_BYTE (code & 0x7F);
3959         }
3960     }
3961   coding->result = CODING_RESULT_SUCCESS;
3962   coding->produced_char += produced_chars;
3963   coding->produced = dst - coding->destination;
3964   return 0;
3965 }
3966
3967 static int
3968 encode_coding_big5 (coding)
3969      struct coding_system *coding;
3970 {
3971   int multibytep = coding->dst_multibyte;
3972   int *charbuf = coding->charbuf;
3973   int *charbuf_end = charbuf + coding->charbuf_used;
3974   unsigned char *dst = coding->destination + coding->produced;
3975   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3976   int safe_room = 4;
3977   int produced_chars = 0;
3978   Lisp_Object attrs, eol_type, charset_list, val;
3979   int ascii_compatible;
3980   struct charset *charset_roman, *charset_big5;
3981   int c;
3982
3983   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3984   val = charset_list;
3985   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3986   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3987   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3988
3989   while (charbuf < charbuf_end)
3990     {
3991       ASSURE_DESTINATION (safe_room);
3992       c = *charbuf++;
3993       /* Now encode the character C.  */
3994       if (ASCII_CHAR_P (c) && ascii_compatible)
3995         EMIT_ONE_ASCII_BYTE (c);
3996       else
3997         {
3998           unsigned code;
3999           struct charset *charset = char_charset (c, charset_list, &code);
4000
4001           if (! charset)
4002             {
4003               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4004                 {
4005                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4006                   charset = CHARSET_FROM_ID (charset_ascii);
4007                 }
4008               else
4009                 {
4010                   c = coding->default_char;
4011                   charset = char_charset (c, charset_list, &code);
4012                 }
4013             }
4014           if (code == CHARSET_INVALID_CODE (charset))
4015             abort ();
4016           if (charset == charset_big5)
4017             {
4018               int c1, c2;
4019
4020               c1 = code >> 8, c2 = code & 0xFF;
4021               EMIT_TWO_BYTES (c1, c2);
4022             }
4023           else
4024             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4025         }
4026     }
4027   coding->result = CODING_RESULT_SUCCESS;
4028   coding->produced_char += produced_chars;
4029   coding->produced = dst - coding->destination;
4030   return 0;
4031 }
4032
4033 \f
4034 /*** 10. CCL handlers ***/
4035
4036 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4037    Check if a text is encoded in a coding system of which
4038    encoder/decoder are written in CCL program.  If it is, return
4039    CATEGORY_MASK_CCL, else return 0.  */
4040
4041 static int
4042 detect_coding_ccl (coding, mask)
4043      struct coding_system *coding;
4044      int *mask;
4045 {
4046   unsigned char *src = coding->source, *src_base = src;
4047   unsigned char *src_end = coding->source + coding->src_bytes;
4048   int multibytep = coding->src_multibyte;
4049   int consumed_chars = 0;
4050   int found = 0;
4051   unsigned char *valids = CODING_CCL_VALIDS (coding);
4052   int head_ascii = coding->head_ascii;
4053   Lisp_Object attrs;
4054
4055   coding = &coding_categories[coding_category_ccl];
4056   attrs = CODING_ID_ATTRS (coding->id);
4057   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4058     src += head_ascii;
4059
4060   while (1)
4061     {
4062       int c;
4063       ONE_MORE_BYTE (c);
4064       if (! valids[c])
4065         break;
4066       if (!found && valids[c] > 1)
4067         found = 1;
4068     }
4069   *mask &= ~CATEGORY_MASK_CCL;
4070   return 0;
4071
4072  no_more_source:
4073   if (!found)
4074     return 0;
4075   *mask &= CATEGORY_MASK_CCL;
4076   return 1;
4077 }
4078
4079 static void
4080 decode_coding_ccl (coding)
4081      struct coding_system *coding;
4082 {
4083   unsigned char *src = coding->source + coding->consumed;
4084   unsigned char *src_end = coding->source + coding->src_bytes;
4085   int *charbuf = coding->charbuf;
4086   int *charbuf_end = charbuf + coding->charbuf_size;
4087   int consumed_chars = 0;
4088   int multibytep = coding->src_multibyte;
4089   struct ccl_program ccl;
4090   int source_charbuf[1024];
4091   int source_byteidx[1024];
4092
4093   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4094
4095   while (src < src_end)
4096     {
4097       unsigned char *p = src;
4098       int *source, *source_end;
4099       int i = 0;
4100
4101       if (multibytep)
4102         while (i < 1024 && p < src_end)
4103           {
4104             source_byteidx[i] = p - src;
4105             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4106           }
4107       else
4108         while (i < 1024 && p < src_end)
4109           source_charbuf[i++] = *p++;
4110
4111       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4112         ccl.last_block = 1;
4113
4114       source = source_charbuf;
4115       source_end = source + i;
4116       while (source < source_end)
4117         {
4118           ccl_driver (&ccl, source, charbuf,
4119                       source_end - source, charbuf_end - charbuf);
4120           source += ccl.consumed;
4121           charbuf += ccl.produced;
4122           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4123             break;
4124         }
4125       if (source < source_end)
4126         src += source_byteidx[source - source_charbuf];
4127       else
4128         src = p;
4129       consumed_chars += source - source_charbuf;
4130
4131       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4132           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4133         break;
4134     }
4135
4136   switch (ccl.status)
4137     {
4138     case CCL_STAT_SUSPEND_BY_SRC:
4139       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4140       break;
4141     case CCL_STAT_SUSPEND_BY_DST:
4142       break;
4143     case CCL_STAT_QUIT:
4144     case CCL_STAT_INVALID_CMD:
4145       coding->result = CODING_RESULT_INTERRUPT;
4146       break;
4147     default:
4148       coding->result = CODING_RESULT_SUCCESS;
4149       break;
4150     }
4151   coding->consumed_char += consumed_chars;
4152   coding->consumed = src - coding->source;
4153   coding->charbuf_used = charbuf - coding->charbuf;
4154 }
4155
4156 static int
4157 encode_coding_ccl (coding)
4158      struct coding_system *coding;
4159 {
4160   struct ccl_program ccl;
4161   int multibytep = coding->dst_multibyte;
4162   int *charbuf = coding->charbuf;
4163   int *charbuf_end = charbuf + coding->charbuf_used;
4164   unsigned char *dst = coding->destination + coding->produced;
4165   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4166   unsigned char *adjusted_dst_end = dst_end - 1;
4167   int destination_charbuf[1024];
4168   int i, produced_chars = 0;
4169
4170   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4171
4172   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4173   ccl.dst_multibyte = coding->dst_multibyte;
4174
4175   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4176     {
4177       int dst_bytes = dst_end - dst;
4178       if (dst_bytes > 1024)
4179         dst_bytes = 1024;
4180
4181       ccl_driver (&ccl, charbuf, destination_charbuf,
4182                   charbuf_end - charbuf, dst_bytes);
4183       charbuf += ccl.consumed;
4184       if (multibytep)
4185         for (i = 0; i < ccl.produced; i++)
4186           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4187       else
4188         {
4189           for (i = 0; i < ccl.produced; i++)
4190             *dst++ = destination_charbuf[i] & 0xFF;
4191           produced_chars += ccl.produced;
4192         }
4193     }
4194
4195   switch (ccl.status)
4196     {
4197     case CCL_STAT_SUSPEND_BY_SRC:
4198       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4199       break;
4200     case CCL_STAT_SUSPEND_BY_DST:
4201       coding->result = CODING_RESULT_INSUFFICIENT_DST;
4202       break;
4203     case CCL_STAT_QUIT:
4204     case CCL_STAT_INVALID_CMD:
4205       coding->result = CODING_RESULT_INTERRUPT;
4206       break;
4207     default:
4208       coding->result = CODING_RESULT_SUCCESS;
4209       break;
4210     }
4211
4212   coding->produced_char += produced_chars;
4213   coding->produced = dst - coding->destination;
4214   return 0;
4215 }
4216
4217
4218 \f
4219 /*** 10, 11. no-conversion handlers ***/
4220
4221 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4222
4223 static void
4224 decode_coding_raw_text (coding)
4225      struct coding_system *coding;
4226 {
4227   coding->chars_at_source = 1;
4228   coding->consumed_char = 0;
4229   coding->consumed = 0;
4230   coding->result = CODING_RESULT_SUCCESS;
4231 }
4232
4233 static int
4234 encode_coding_raw_text (coding)
4235      struct coding_system *coding;
4236 {
4237   int multibytep = coding->dst_multibyte;
4238   int *charbuf = coding->charbuf;
4239   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4240   unsigned char *dst = coding->destination + coding->produced;
4241   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4242   int produced_chars = 0;
4243   int c;
4244
4245   if (multibytep)
4246     {
4247       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4248
4249       if (coding->src_multibyte)
4250         while (charbuf < charbuf_end)
4251           {
4252             ASSURE_DESTINATION (safe_room);
4253             c = *charbuf++;
4254             if (ASCII_CHAR_P (c))
4255               EMIT_ONE_ASCII_BYTE (c);
4256             else if (CHAR_BYTE8_P (c))
4257               {
4258                 c = CHAR_TO_BYTE8 (c);
4259                 EMIT_ONE_BYTE (c);
4260               }
4261             else
4262               {
4263                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4264
4265                 CHAR_STRING_ADVANCE (c, p1);
4266                 while (p0 < p1)
4267                   EMIT_ONE_BYTE (*p0);
4268               }
4269           }
4270       else
4271         while (charbuf < charbuf_end)
4272           {
4273             ASSURE_DESTINATION (safe_room);
4274             c = *charbuf++;
4275             EMIT_ONE_BYTE (c);
4276           }
4277     }
4278   else
4279     {
4280       if (coding->src_multibyte)
4281         {
4282           int safe_room = MAX_MULTIBYTE_LENGTH;
4283
4284           while (charbuf < charbuf_end)
4285             {
4286               ASSURE_DESTINATION (safe_room);
4287               c = *charbuf++;
4288               if (ASCII_CHAR_P (c))
4289                 *dst++ = c;
4290               else if (CHAR_BYTE8_P (c))
4291                 *dst++ = CHAR_TO_BYTE8 (c);
4292               else
4293                 CHAR_STRING_ADVANCE (c, dst);
4294               produced_chars++;
4295             }
4296         }
4297       else
4298         {
4299           ASSURE_DESTINATION (charbuf_end - charbuf);
4300           while (charbuf < charbuf_end && dst < dst_end)
4301             *dst++ = *charbuf++;
4302           produced_chars = dst - (coding->destination + coding->dst_bytes);
4303         }
4304     }
4305   coding->result = CODING_RESULT_SUCCESS;
4306   coding->produced_char += produced_chars;
4307   coding->produced = dst - coding->destination;
4308   return 0;
4309 }
4310
4311 static int
4312 detect_coding_charset (coding, mask)
4313      struct coding_system *coding;
4314      int *mask;
4315 {
4316   unsigned char *src = coding->source, *src_base = src;
4317   unsigned char *src_end = coding->source + coding->src_bytes;
4318   int multibytep = coding->src_multibyte;
4319   int consumed_chars = 0;
4320   Lisp_Object attrs, valids;
4321
4322   coding = &coding_categories[coding_category_charset];
4323   attrs = CODING_ID_ATTRS (coding->id);
4324   valids = AREF (attrs, coding_attr_charset_valids);
4325
4326   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4327     src += coding->head_ascii;
4328
4329   while (1)
4330     {
4331       int c;
4332
4333       ONE_MORE_BYTE (c);
4334       if (NILP (AREF (valids, c)))
4335         break;
4336     }
4337   *mask &= ~CATEGORY_MASK_CHARSET;
4338   return 0;
4339
4340  no_more_source:
4341   *mask &= CATEGORY_MASK_CHARSET;
4342   return 1;
4343 }
4344
4345 static void
4346 decode_coding_charset (coding)
4347      struct coding_system *coding;
4348 {
4349   unsigned char *src = coding->source + coding->consumed;
4350   unsigned char *src_end = coding->source + coding->src_bytes;
4351   unsigned char *src_base;
4352   int *charbuf = coding->charbuf;
4353   int *charbuf_end = charbuf + coding->charbuf_size;
4354   int consumed_chars = 0, consumed_chars_base;
4355   int multibytep = coding->src_multibyte;
4356   Lisp_Object attrs, eol_type, charset_list, valids;
4357
4358   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4359   valids = AREF (attrs, coding_attr_charset_valids);
4360
4361   while (1)
4362     {
4363       int c;
4364
4365       src_base = src;
4366       consumed_chars_base = consumed_chars;
4367
4368       if (charbuf >= charbuf_end)
4369         break;
4370
4371       ONE_MORE_BYTE (c);
4372       if (c == '\r')
4373         {
4374           /* Here we assume that no charset maps '\r' to something
4375              else.  */
4376           if (EQ (eol_type, Qdos))
4377             {
4378               if (src < src_end
4379                   && *src == '\n')
4380                 ONE_MORE_BYTE (c);
4381             }
4382           else if (EQ (eol_type, Qmac))
4383             c = '\n';
4384         }
4385       else
4386         {
4387           Lisp_Object val;
4388           struct charset *charset;
4389           int dim;
4390           int len = 1;
4391           unsigned code = c;
4392
4393           val = AREF (valids, c);
4394           if (NILP (val))
4395             goto invalid_code;
4396           if (INTEGERP (val))
4397             {
4398               charset = CHARSET_FROM_ID (XFASTINT (val));
4399               dim = CHARSET_DIMENSION (charset);
4400               while (len < dim)
4401                 {
4402                   ONE_MORE_BYTE (c);
4403                   code = (code << 8) | c;
4404                   len++;
4405                 }
4406               CODING_DECODE_CHAR (coding, src, src_base, src_end,
4407                                   charset, code, c);
4408             }
4409           else
4410             {
4411               /* VAL is a list of charset IDs.  It is assured that the
4412                  list is sorted by charset dimensions (smaller one
4413                  comes first).  */
4414               while (CONSP (val))
4415                 {
4416                   charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4417                   dim = CHARSET_DIMENSION (charset);
4418                   while (len < dim)
4419                     {
4420                       ONE_MORE_BYTE (c);
4421                       code = (code << 8) | c;
4422                       len++;
4423                     }
4424                   CODING_DECODE_CHAR (coding, src, src_base,
4425                                       src_end, charset, code, c);
4426                   if (c >= 0)
4427                     break;
4428                   val = XCDR (val);
4429                 }
4430             }
4431           if (c < 0)
4432             goto invalid_code;
4433         }
4434       *charbuf++ = c;
4435       continue;
4436
4437     invalid_code:
4438       src = src_base;
4439       consumed_chars = consumed_chars_base;
4440       ONE_MORE_BYTE (c);
4441       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4442       coding->errors++;
4443     }
4444
4445  no_more_source:
4446   coding->consumed_char += consumed_chars_base;
4447   coding->consumed = src_base - coding->source;
4448   coding->charbuf_used = charbuf - coding->charbuf;
4449 }
4450
4451 static int
4452 encode_coding_charset (coding)
4453      struct coding_system *coding;
4454 {
4455   int multibytep = coding->dst_multibyte;
4456   int *charbuf = coding->charbuf;
4457   int *charbuf_end = charbuf + coding->charbuf_used;
4458   unsigned char *dst = coding->destination + coding->produced;
4459   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4460   int safe_room = MAX_MULTIBYTE_LENGTH;
4461   int produced_chars = 0;
4462   Lisp_Object attrs, eol_type, charset_list;
4463   int ascii_compatible;
4464   int c;
4465
4466   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4467   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4468
4469   while (charbuf < charbuf_end)
4470     {
4471       struct charset *charset;
4472       unsigned code;
4473
4474       ASSURE_DESTINATION (safe_room);
4475       c = *charbuf++;
4476       if (ascii_compatible && ASCII_CHAR_P (c))
4477         EMIT_ONE_ASCII_BYTE (c);
4478       else
4479         {
4480           charset = char_charset (c, charset_list, &code);
4481           if (charset)
4482             {
4483               if (CHARSET_DIMENSION (charset) == 1)
4484                 EMIT_ONE_BYTE (code);
4485               else if (CHARSET_DIMENSION (charset) == 2)
4486                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4487               else if (CHARSET_DIMENSION (charset) == 3)
4488                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4489               else
4490                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4491                                  (code >> 8) & 0xFF, code & 0xFF);
4492             }
4493           else
4494             {
4495               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4496                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4497               else
4498                 c = coding->default_char;
4499               EMIT_ONE_BYTE (c);
4500             }
4501         }
4502     }
4503
4504   coding->result = CODING_RESULT_SUCCESS;
4505   coding->produced_char += produced_chars;
4506   coding->produced = dst - coding->destination;
4507   return 0;
4508 }
4509
4510 \f
4511 /*** 7. C library functions ***/
4512
4513 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
4514    has a property `coding-system'.  The value of this property is a
4515    vector of length 5 (called as coding-vector).  Among elements of
4516    this vector, the first (element[0]) and the fifth (element[4])
4517    carry important information for decoding/encoding.  Before
4518    decoding/encoding, this information should be set in fields of a
4519    structure of type `coding_system'.
4520
4521    A value of property `coding-system' can be a symbol of another
4522    subsidiary coding-system.  In that case, Emacs gets coding-vector
4523    from that symbol.
4524
4525    `element[0]' contains information to be set in `coding->type'.  The
4526    value and its meaning is as follows:
4527
4528    0 -- coding_type_emacs_mule
4529    1 -- coding_type_sjis
4530    2 -- coding_type_iso_2022
4531    3 -- coding_type_big5
4532    4 -- coding_type_ccl encoder/decoder written in CCL
4533    nil -- coding_type_no_conversion
4534    t -- coding_type_undecided (automatic conversion on decoding,
4535                                no-conversion on encoding)
4536
4537    `element[4]' contains information to be set in `coding->flags' and
4538    `coding->spec'.  The meaning varies by `coding->type'.
4539
4540    If `coding->type' is `coding_type_iso_2022', element[4] is a vector
4541    of length 32 (of which the first 13 sub-elements are used now).
4542    Meanings of these sub-elements are:
4543
4544    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso_2022'
4545         If the value is an integer of valid charset, the charset is
4546         assumed to be designated to graphic register N initially.
4547
4548         If the value is minus, it is a minus value of charset which
4549         reserves graphic register N, which means that the charset is
4550         not designated initially but should be designated to graphic
4551         register N just before encoding a character in that charset.
4552
4553         If the value is nil, graphic register N is never used on
4554         encoding.
4555
4556    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
4557         Each value takes t or nil.  See the section ISO2022 of
4558         `coding.h' for more information.
4559
4560    If `coding->type' is `coding_type_big5', element[4] is t to denote
4561    BIG5-ETen or nil to denote BIG5-HKU.
4562
4563    If `coding->type' takes the other value, element[4] is ignored.
4564
4565    Emacs Lisp's coding system also carries information about format of
4566    end-of-line in a value of property `eol-type'.  If the value is
4567    integer, 0 means eol_lf, 1 means eol_crlf, and 2 means eol_cr.  If
4568    it is not integer, it should be a vector of subsidiary coding
4569    systems of which property `eol-type' has one of above values.
4570
4571 */
4572
4573 /* Setup coding context CODING from information about CODING_SYSTEM.
4574    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4575    CODING_SYSTEM is invalid, signal an error.  */
4576
4577 void
4578 setup_coding_system (coding_system, coding)
4579      Lisp_Object coding_system;
4580      struct coding_system *coding;
4581 {
4582   Lisp_Object attrs;
4583   Lisp_Object eol_type;
4584   Lisp_Object coding_type;
4585   Lisp_Object val;
4586
4587   if (NILP (coding_system))
4588     coding_system = Qno_conversion;
4589
4590   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4591
4592   attrs = CODING_ID_ATTRS (coding->id);
4593   eol_type = CODING_ID_EOL_TYPE (coding->id);
4594
4595   coding->mode = 0;
4596   coding->head_ascii = -1;
4597   coding->common_flags
4598     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4599
4600   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4601   coding->max_charset_id = XSTRING (val)->size - 1;
4602   coding->safe_charsets = (char *) XSTRING (val)->data;
4603   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4604
4605   coding_type = CODING_ATTR_TYPE (attrs);
4606   if (EQ (coding_type, Qundecided))
4607     {
4608       coding->detector = NULL;
4609       coding->decoder = decode_coding_raw_text;
4610       coding->encoder = encode_coding_raw_text;
4611       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4612     }
4613   else if (EQ (coding_type, Qiso_2022))
4614     {
4615       int i;
4616       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4617
4618       /* Invoke graphic register 0 to plane 0.  */
4619       CODING_ISO_INVOCATION (coding, 0) = 0;
4620       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4621       CODING_ISO_INVOCATION (coding, 1)
4622         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4623       /* Setup the initial status of designation.  */
4624       for (i = 0; i < 4; i++)
4625         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4626       /* Not single shifting initially.  */
4627       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4628       /* Beginning of buffer should also be regarded as bol. */
4629       CODING_ISO_BOL (coding) = 1;
4630       coding->detector = detect_coding_iso_2022;
4631       coding->decoder = decode_coding_iso_2022;
4632       coding->encoder = encode_coding_iso_2022;
4633       if (flags & CODING_ISO_FLAG_SAFE)
4634         coding->mode |= CODING_MODE_SAFE_ENCODING;
4635       coding->common_flags
4636         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4637             | CODING_REQUIRE_FLUSHING_MASK);
4638       if (flags & CODING_ISO_FLAG_COMPOSITION)
4639         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4640       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4641         {
4642           setup_iso_safe_charsets (attrs);
4643           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4644           coding->max_charset_id = XSTRING (val)->size - 1;
4645           coding->safe_charsets = (char *) XSTRING (val)->data;
4646         }
4647       CODING_ISO_FLAGS (coding) = flags;
4648     }
4649   else if (EQ (coding_type, Qcharset))
4650     {
4651       coding->detector = detect_coding_charset;
4652       coding->decoder = decode_coding_charset;
4653       coding->encoder = encode_coding_charset;
4654       coding->common_flags
4655         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4656     }
4657   else if (EQ (coding_type, Qutf_8))
4658     {
4659       coding->detector = detect_coding_utf_8;
4660       coding->decoder = decode_coding_utf_8;
4661       coding->encoder = encode_coding_utf_8;
4662       coding->common_flags
4663         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4664     }
4665   else if (EQ (coding_type, Qutf_16))
4666     {
4667       val = AREF (attrs, coding_attr_utf_16_bom);
4668       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4669                                     : EQ (val, Qt) ? utf_16_with_bom
4670                                     : utf_16_without_bom);
4671       val = AREF (attrs, coding_attr_utf_16_endian);
4672       CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4673                                        : utf_16_little_endian);
4674       CODING_UTF_16_SURROGATE (coding) = 0;
4675       coding->detector = detect_coding_utf_16;
4676       coding->decoder = decode_coding_utf_16;
4677       coding->encoder = encode_coding_utf_16;
4678       coding->common_flags
4679         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4680     }
4681   else if (EQ (coding_type, Qccl))
4682     {
4683       coding->detector = detect_coding_ccl;
4684       coding->decoder = decode_coding_ccl;
4685       coding->encoder = encode_coding_ccl;
4686       coding->common_flags
4687         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4688             | CODING_REQUIRE_FLUSHING_MASK);
4689     }
4690   else if (EQ (coding_type, Qemacs_mule))
4691     {
4692       coding->detector = detect_coding_emacs_mule;
4693       coding->decoder = decode_coding_emacs_mule;
4694       coding->encoder = encode_coding_emacs_mule;
4695       coding->common_flags
4696         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4697       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4698           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4699         {
4700           Lisp_Object tail, safe_charsets;
4701           int max_charset_id = 0;
4702
4703           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4704                tail = XCDR (tail))
4705             if (max_charset_id < XFASTINT (XCAR (tail)))
4706               max_charset_id = XFASTINT (XCAR (tail));
4707           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4708                                         make_number (255));
4709           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4710                tail = XCDR (tail))
4711             XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4712           coding->max_charset_id = max_charset_id;
4713           coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4714         }
4715     }
4716   else if (EQ (coding_type, Qshift_jis))
4717     {
4718       coding->detector = detect_coding_sjis;
4719       coding->decoder = decode_coding_sjis;
4720       coding->encoder = encode_coding_sjis;
4721       coding->common_flags
4722         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4723     }
4724   else if (EQ (coding_type, Qbig5))
4725     {
4726       coding->detector = detect_coding_big5;
4727       coding->decoder = decode_coding_big5;
4728       coding->encoder = encode_coding_big5;
4729       coding->common_flags
4730         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4731     }
4732   else                          /* EQ (coding_type, Qraw_text) */
4733     {
4734       coding->detector = NULL;
4735       coding->decoder = decode_coding_raw_text;
4736       coding->encoder = encode_coding_raw_text;
4737       coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4738     }
4739
4740   return;
4741 }
4742
4743 /* Return raw-text or one of its subsidiaries that has the same
4744    eol_type as CODING-SYSTEM.  */
4745
4746 Lisp_Object
4747 raw_text_coding_system (coding_system)
4748      Lisp_Object coding_system;
4749 {
4750   Lisp_Object spec, attrs;
4751   Lisp_Object eol_type, raw_text_eol_type;
4752
4753   spec = CODING_SYSTEM_SPEC (coding_system);
4754   attrs = AREF (spec, 0);
4755
4756   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4757     return coding_system;
4758
4759   eol_type = AREF (spec, 2);
4760   if (VECTORP (eol_type))
4761     return Qraw_text;
4762   spec = CODING_SYSTEM_SPEC (Qraw_text);
4763   raw_text_eol_type = AREF (spec, 2);
4764   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4765           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4766           : AREF (raw_text_eol_type, 2));
4767 }
4768
4769
4770 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4771    does, return one of the subsidiary that has the same eol-spec as
4772    PARENT.  Otherwise, return CODING_SYSTEM.  */
4773
4774 Lisp_Object
4775 coding_inherit_eol_type (coding_system, parent)
4776      Lisp_Object coding_system, parent;
4777 {
4778   Lisp_Object spec, attrs, eol_type;
4779
4780   spec = CODING_SYSTEM_SPEC (coding_system);
4781   attrs = AREF (spec, 0);
4782   eol_type = AREF (spec, 2);
4783   if (VECTORP (eol_type))
4784     {
4785       Lisp_Object parent_spec;
4786       Lisp_Object parent_eol_type;
4787
4788       parent_spec
4789         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
4790       parent_eol_type = AREF (parent_spec, 2);
4791       if (EQ (parent_eol_type, Qunix))
4792         coding_system = AREF (eol_type, 0);
4793       else if (EQ (parent_eol_type, Qdos))
4794         coding_system = AREF (eol_type, 1);
4795       else if (EQ (parent_eol_type, Qmac))
4796         coding_system = AREF (eol_type, 2);
4797     }
4798   return coding_system;
4799 }
4800
4801 /* Emacs has a mechanism to automatically detect a coding system if it
4802    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
4803    it's impossible to distinguish some coding systems accurately
4804    because they use the same range of codes.  So, at first, coding
4805    systems are categorized into 7, those are:
4806
4807    o coding-category-emacs-mule
4808
4809         The category for a coding system which has the same code range
4810         as Emacs' internal format.  Assigned the coding-system (Lisp
4811         symbol) `emacs-mule' by default.
4812
4813    o coding-category-sjis
4814
4815         The category for a coding system which has the same code range
4816         as SJIS.  Assigned the coding-system (Lisp
4817         symbol) `japanese-shift-jis' by default.
4818
4819    o coding-category-iso-7
4820
4821         The category for a coding system which has the same code range
4822         as ISO2022 of 7-bit environment.  This doesn't use any locking
4823         shift and single shift functions.  This can encode/decode all
4824         charsets.  Assigned the coding-system (Lisp symbol)
4825         `iso-2022-7bit' by default.
4826
4827    o coding-category-iso-7-tight
4828
4829         Same as coding-category-iso-7 except that this can
4830         encode/decode only the specified charsets.
4831
4832    o coding-category-iso-8-1
4833
4834         The category for a coding system which has the same code range
4835         as ISO2022 of 8-bit environment and graphic plane 1 used only
4836         for DIMENSION1 charset.  This doesn't use any locking shift
4837         and single shift functions.  Assigned the coding-system (Lisp
4838         symbol) `iso-latin-1' by default.
4839
4840    o coding-category-iso-8-2
4841
4842         The category for a coding system which has the same code range
4843         as ISO2022 of 8-bit environment and graphic plane 1 used only
4844         for DIMENSION2 charset.  This doesn't use any locking shift
4845         and single shift functions.  Assigned the coding-system (Lisp
4846         symbol) `japanese-iso-8bit' by default.
4847
4848    o coding-category-iso-7-else
4849
4850         The category for a coding system which has the same code range
4851         as ISO2022 of 7-bit environemnt but uses locking shift or
4852         single shift functions.  Assigned the coding-system (Lisp
4853         symbol) `iso-2022-7bit-lock' by default.
4854
4855    o coding-category-iso-8-else
4856
4857         The category for a coding system which has the same code range
4858         as ISO2022 of 8-bit environemnt but uses locking shift or
4859         single shift functions.  Assigned the coding-system (Lisp
4860         symbol) `iso-2022-8bit-ss2' by default.
4861
4862    o coding-category-big5
4863
4864         The category for a coding system which has the same code range
4865         as BIG5.  Assigned the coding-system (Lisp symbol)
4866         `cn-big5' by default.
4867
4868    o coding-category-utf-8
4869
4870         The category for a coding system which has the same code range
4871         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
4872         symbol) `utf-8' by default.
4873
4874    o coding-category-utf-16-be
4875
4876         The category for a coding system in which a text has an
4877         Unicode signature (cf. Unicode Standard) in the order of BIG
4878         endian at the head.  Assigned the coding-system (Lisp symbol)
4879         `utf-16-be' by default.
4880
4881    o coding-category-utf-16-le
4882
4883         The category for a coding system in which a text has an
4884         Unicode signature (cf. Unicode Standard) in the order of
4885         LITTLE endian at the head.  Assigned the coding-system (Lisp
4886         symbol) `utf-16-le' by default.
4887
4888    o coding-category-ccl
4889
4890         The category for a coding system of which encoder/decoder is
4891         written in CCL programs.  The default value is nil, i.e., no
4892         coding system is assigned.
4893
4894    o coding-category-binary
4895
4896         The category for a coding system not categorized in any of the
4897         above.  Assigned the coding-system (Lisp symbol)
4898         `no-conversion' by default.
4899
4900    Each of them is a Lisp symbol and the value is an actual
4901    `coding-system's (this is also a Lisp symbol) assigned by a user.
4902    What Emacs does actually is to detect a category of coding system.
4903    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4904    decide only one possible category, it selects a category of the
4905    highest priority.  Priorities of categories are also specified by a
4906    user in a Lisp variable `coding-category-list'.
4907
4908 */
4909
4910 #define EOL_SEEN_NONE   0
4911 #define EOL_SEEN_LF     1
4912 #define EOL_SEEN_CR     2
4913 #define EOL_SEEN_CRLF   4
4914
4915 /* Detect how end-of-line of a text of length CODING->src_bytes
4916    pointed by CODING->source is encoded.  Return one of
4917    EOL_SEEN_XXX.  */
4918
4919 #define MAX_EOL_CHECK_COUNT 3
4920
4921 static int
4922 detect_eol (coding, source, src_bytes)
4923      struct coding_system *coding;
4924      unsigned char *source;
4925      EMACS_INT src_bytes;
4926 {
4927   Lisp_Object attrs, coding_type;
4928   unsigned char *src = source, *src_end = src + src_bytes;
4929   unsigned char c;
4930   int total  = 0;
4931   int eol_seen = EOL_SEEN_NONE;
4932
4933   attrs = CODING_ID_ATTRS (coding->id);
4934   coding_type = CODING_ATTR_TYPE (attrs);
4935
4936   if (EQ (coding_type, Qccl))
4937     {
4938       int msb, lsb;
4939
4940       msb = coding->spec.utf_16.endian == utf_16_little_endian;
4941       lsb = 1 - msb;
4942
4943       while (src + 1 < src_end)
4944         {
4945           c = src[lsb];
4946           if (src[msb] == 0 && (c == '\n' || c == '\r'))
4947             {
4948               int this_eol;
4949
4950               if (c == '\n')
4951                 this_eol = EOL_SEEN_LF;
4952               else if (src + 3 >= src_end
4953                        || src[msb + 2] != 0
4954                        || src[lsb + 2] != '\n')
4955                 this_eol = EOL_SEEN_CR;
4956               else
4957                 this_eol = EOL_SEEN_CRLF;
4958
4959               if (eol_seen == EOL_SEEN_NONE)
4960                 /* This is the first end-of-line.  */
4961                 eol_seen = this_eol;
4962               else if (eol_seen != this_eol)
4963                 {
4964                   /* The found type is different from what found before.  */
4965                   eol_seen = EOL_SEEN_LF;
4966                   break;
4967                 }
4968               if (++total == MAX_EOL_CHECK_COUNT)
4969                 break;
4970             }
4971           src += 2;
4972         }
4973     }
4974   else
4975     {
4976       while (src < src_end)
4977         {
4978           c = *src++;
4979           if (c == '\n' || c == '\r')
4980             {
4981               int this_eol;
4982
4983               if (c == '\n')
4984                 this_eol = EOL_SEEN_LF;
4985               else if (src >= src_end || *src != '\n')
4986                 this_eol = EOL_SEEN_CR;
4987               else
4988                 this_eol = EOL_SEEN_CRLF, src++;
4989
4990               if (eol_seen == EOL_SEEN_NONE)
4991                 /* This is the first end-of-line.  */
4992                 eol_seen = this_eol;
4993               else if (eol_seen != this_eol)
4994                 {
4995                   /* The found type is different from what found before.  */
4996                   eol_seen = EOL_SEEN_LF;
4997                   break;
4998                 }
4999               if (++total == MAX_EOL_CHECK_COUNT)
5000                 break;
5001             }
5002         }
5003     }
5004   return eol_seen;
5005 }
5006
5007
5008 static void
5009 adjust_coding_eol_type (coding, eol_seen)
5010      struct coding_system *coding;
5011      int eol_seen;
5012 {
5013   Lisp_Object eol_type;
5014
5015   eol_type = CODING_ID_EOL_TYPE (coding->id);
5016   if (eol_seen & EOL_SEEN_LF)
5017     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5018   else if (eol_type & EOL_SEEN_CRLF)
5019     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5020   else if (eol_type & EOL_SEEN_CR)
5021     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5022 }
5023
5024 /* Detect how a text specified in CODING is encoded.  If a coding
5025    system is detected, update fields of CODING by the detected coding
5026    system.  */
5027
5028 void
5029 detect_coding (coding)
5030      struct coding_system *coding;
5031 {
5032   unsigned char *src, *src_end;
5033   Lisp_Object attrs, coding_type;
5034
5035   coding->consumed = coding->consumed_char = 0;
5036   coding->produced = coding->produced_char = 0;
5037   coding_set_source (coding);
5038
5039   src_end = coding->source + coding->src_bytes;
5040
5041   /* If we have not yet decided the text encoding type, detect it
5042      now.  */
5043   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5044     {
5045       int mask = CATEGORY_MASK_ANY;
5046       int c, i;
5047
5048       for (src = coding->source; src < src_end; src++)
5049         {
5050           c = *src;
5051           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5052                                         || c == ISO_CODE_SI
5053                                         || c == ISO_CODE_SO)))
5054             break;
5055         }
5056       coding->head_ascii = src - (coding->source + coding->consumed);
5057
5058       if (coding->head_ascii < coding->src_bytes)
5059         {
5060           int detected = 0;
5061
5062           for (i = 0; i < coding_category_raw_text; i++)
5063             {
5064               enum coding_category category = coding_priorities[i];
5065               struct coding_system *this = coding_categories + category;
5066
5067               if (category >= coding_category_raw_text
5068                   || detected & (1 << category))
5069                 continue;
5070
5071               if (this->id < 0)
5072                 {
5073                   /* No coding system of this category is defined.  */
5074                   mask &= ~(1 << category);
5075                 }
5076               else
5077                 {
5078                   detected |= detected_mask[category];
5079                   if ((*(this->detector)) (coding, &mask))
5080                     break;
5081                 }
5082             }
5083           if (! mask)
5084             setup_coding_system (Qraw_text, coding);
5085           else if (mask != CATEGORY_MASK_ANY)
5086             for (i = 0; i < coding_category_raw_text; i++)
5087               {
5088                 enum coding_category category = coding_priorities[i];
5089                 struct coding_system *this = coding_categories + category;
5090
5091                 if (mask & (1 << category))
5092                   {
5093                     setup_coding_system (CODING_ID_NAME (this->id), coding);
5094                     break;
5095                   }
5096               }
5097         }
5098     }
5099
5100   attrs = CODING_ID_ATTRS (coding->id);
5101   coding_type = CODING_ATTR_TYPE (attrs);
5102
5103   /* If we have not yet decided the EOL type, detect it now.  But, the
5104      detection is impossible for a CCL based coding system, in which
5105      case, we detct the EOL type after decoding.  */
5106   if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5107       && ! EQ (coding_type, Qccl))
5108     {
5109       int eol_seen = detect_eol (coding, coding->source, coding->src_bytes);
5110
5111       if (eol_seen != EOL_SEEN_NONE)
5112         adjust_coding_eol_type (coding, eol_seen);
5113     }
5114 }
5115
5116
5117 static void
5118 decode_eol (coding)
5119      struct coding_system *coding;
5120 {
5121   if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5122     {
5123       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5124       unsigned char *pend = p + coding->produced;
5125       int eol_seen = EOL_SEEN_NONE;
5126
5127       for (; p < pend; p++)
5128         {
5129           if (*p == '\n')
5130             eol_seen |= EOL_SEEN_LF;
5131           else if (*p == '\r')
5132             {
5133               if (p + 1 < pend && *(p + 1) == '\n')
5134                 {
5135                   eol_seen |= EOL_SEEN_CRLF;
5136                   p++;
5137                 }
5138               else
5139                 eol_seen |= EOL_SEEN_CR;
5140             }
5141         }
5142       if (eol_seen != EOL_SEEN_NONE)
5143         adjust_coding_eol_type (coding, eol_seen);
5144     }
5145
5146   if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5147     {
5148       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5149       unsigned char *pend = p + coding->produced;
5150
5151       for (; p < pend; p++)
5152         if (*p == '\r')
5153           *p = '\n';
5154     }
5155   else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5156     {
5157       unsigned char *p, *pbeg, *pend;
5158       Lisp_Object undo_list;
5159
5160       move_gap_both (coding->dst_pos + coding->produced_char,
5161                      coding->dst_pos_byte + coding->produced);
5162       undo_list = current_buffer->undo_list;
5163       current_buffer->undo_list = Qt;
5164       del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, Qnil);
5165       current_buffer->undo_list = undo_list;
5166       pbeg = GPT_ADDR;
5167       pend = pbeg + coding->produced;
5168
5169       for (p = pend - 1; p >= pbeg; p--)
5170         if (*p == '\r')
5171           {
5172             safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5173             pend--;
5174           }
5175       coding->produced_char -= coding->produced - (pend - pbeg);
5176       coding->produced = pend - pbeg;
5177       insert_from_gap (coding->produced_char, coding->produced);
5178     }
5179 }
5180
5181 static void
5182 translate_chars (coding, table)
5183      struct coding_system *coding;
5184      Lisp_Object table;
5185 {
5186   int *charbuf = coding->charbuf;
5187   int *charbuf_end = charbuf + coding->charbuf_used;
5188   int c;
5189
5190   if (coding->chars_at_source)
5191     return;
5192
5193   while (charbuf < charbuf_end)
5194     {
5195       c = *charbuf;
5196       if (c < 0)
5197         charbuf += c;
5198       else
5199         *charbuf++ = translate_char (table, c);
5200     }
5201 }
5202
5203 static int
5204 produce_chars (coding)
5205      struct coding_system *coding;
5206 {
5207   unsigned char *dst = coding->destination + coding->produced;
5208   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5209   int produced;
5210   int produced_chars = 0;
5211
5212   if (! coding->chars_at_source)
5213     {
5214       /* Characters are in coding->charbuf.  */
5215       int *buf = coding->charbuf;
5216       int *buf_end = buf + coding->charbuf_used;
5217       unsigned char *adjusted_dst_end;
5218
5219       if (BUFFERP (coding->src_object)
5220           && EQ (coding->src_object, coding->dst_object))
5221         dst_end = coding->source + coding->consumed;
5222       adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5223
5224       while (buf < buf_end)
5225         {
5226           int c = *buf++;
5227
5228           if (dst >= adjusted_dst_end)
5229             {
5230               dst = alloc_destination (coding,
5231                                        buf_end - buf + MAX_MULTIBYTE_LENGTH,
5232                                        dst);
5233               dst_end = coding->destination + coding->dst_bytes;
5234               adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5235             }
5236           if (c >= 0)
5237             {
5238               if (coding->dst_multibyte
5239                   || ! CHAR_BYTE8_P (c))
5240                 CHAR_STRING_ADVANCE (c, dst);
5241               else
5242                 *dst++ = CHAR_TO_BYTE8 (c);
5243               produced_chars++;
5244             }
5245           else
5246             /* This is an annotation data.  */
5247             buf -= c + 1;
5248         }
5249     }
5250   else
5251     {
5252       unsigned char *src = coding->source;
5253       unsigned char *src_end = src + coding->src_bytes;
5254       Lisp_Object eol_type;
5255
5256       eol_type = CODING_ID_EOL_TYPE (coding->id);
5257
5258       if (coding->src_multibyte != coding->dst_multibyte)
5259         {
5260           if (coding->src_multibyte)
5261             {
5262               int multibytep = 1;
5263               int consumed_chars;
5264
5265               while (1)
5266                 {
5267                   unsigned char *src_base = src;
5268                   int c;
5269
5270                   ONE_MORE_BYTE (c);
5271                   if (c == '\r')
5272                     {
5273                       if (EQ (eol_type, Qdos))
5274                         {
5275                           if (src < src_end
5276                               && *src == '\n')
5277                             c = *src++;
5278                         }
5279                       else if (EQ (eol_type, Qmac))
5280                         c = '\n';
5281                     }
5282                   if (dst == dst_end)
5283                     {
5284                       coding->consumed = src - coding->source;
5285
5286                     if (EQ (coding->src_object, coding->dst_object))
5287                       dst_end = src;
5288                     if (dst == dst_end)
5289                       {
5290                         dst = alloc_destination (coding, src_end - src + 1,
5291                                                  dst);
5292                         dst_end = coding->destination + coding->dst_bytes;
5293                         coding_set_source (coding);
5294                         src = coding->source + coding->consumed;
5295                         src_end = coding->source + coding->src_bytes;
5296                       }
5297                     }
5298                   *dst++ = c;
5299                   produced_chars++;
5300                 }
5301             no_more_source:
5302               ;
5303             }
5304           else
5305             while (src < src_end)
5306               {
5307                 int multibytep = 1;
5308                 int c = *src++;
5309
5310                 if (c == '\r')
5311                   {
5312                     if (EQ (eol_type, Qdos))
5313                       {
5314                         if (src < src_end
5315                             && *src == '\n')
5316                           c = *src++;
5317                       }
5318                     else if (EQ (eol_type, Qmac))
5319                       c = '\n';
5320                   }
5321                 if (dst >= dst_end - 1)
5322                   {
5323                     coding->consumed = src - coding->source;
5324
5325                     if (EQ (coding->src_object, coding->dst_object))
5326                       dst_end = src;
5327                     if (dst >= dst_end - 1)
5328                       {
5329                         dst = alloc_destination (coding, src_end - src + 2,
5330                                                  dst);
5331                         dst_end = coding->destination + coding->dst_bytes;
5332                         coding_set_source (coding);
5333                         src = coding->source + coding->consumed;
5334                         src_end = coding->source + coding->src_bytes;
5335                       }
5336                   }
5337                 EMIT_ONE_BYTE (c);
5338               }
5339         }
5340       else
5341         {
5342           if (!EQ (coding->src_object, coding->dst_object))
5343             {
5344               int require = coding->src_bytes - coding->dst_bytes;
5345
5346               if (require > 0)
5347                 {
5348                   EMACS_INT offset = src - coding->source;
5349
5350                   dst = alloc_destination (coding, require, dst);
5351                   coding_set_source (coding);
5352                   src = coding->source + offset;
5353                   src_end = coding->source + coding->src_bytes;
5354                 }
5355             }
5356           produced_chars = coding->src_chars;
5357           while (src < src_end)
5358             {
5359               int c = *src++;
5360
5361               if (c == '\r')
5362                 {
5363                   if (EQ (eol_type, Qdos))
5364                     {
5365                       if (src < src_end
5366                           && *src == '\n')
5367                         c = *src++;
5368                       produced_chars--;
5369                     }
5370                   else if (EQ (eol_type, Qmac))
5371                     c = '\n';
5372                 }
5373               *dst++ = c;
5374             }
5375         }
5376       coding->consumed = coding->src_bytes;
5377       coding->consumed_char = coding->src_chars;
5378     }
5379
5380   produced = dst - (coding->destination + coding->produced);
5381   if (BUFFERP (coding->dst_object))
5382     insert_from_gap (produced_chars, produced);
5383   coding->produced += produced;
5384   coding->produced_char += produced_chars;
5385   return produced_chars;
5386 }
5387
5388 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5389         or
5390    [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5391  */
5392
5393 static INLINE void
5394 produce_composition (coding, charbuf)
5395      struct coding_system *coding;
5396      int *charbuf;
5397 {
5398   Lisp_Object buffer;
5399   int len;
5400   EMACS_INT pos;
5401   enum composition_method method;
5402   int cmp_len;
5403   Lisp_Object components;
5404
5405   buffer = coding->dst_object;
5406   len = -charbuf[0];
5407   pos = coding->dst_pos + charbuf[1];
5408   method = (enum composition_method) (charbuf[3]);
5409   cmp_len = charbuf[4];
5410
5411   if (method == COMPOSITION_RELATIVE)
5412     components = Qnil;
5413   else
5414     {
5415       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5416       int i;
5417
5418       len -= 5;
5419       charbuf += 5;
5420       for (i = 0; i < len; i++)
5421         args[i] = make_number (charbuf[i]);
5422       components = (method == COMPOSITION_WITH_ALTCHARS
5423                     ? Fstring (len, args) : Fvector (len, args));
5424     }
5425   compose_text (pos, pos + cmp_len, components, Qnil, Qnil);
5426 }
5427
5428 static int *
5429 save_composition_data (buf, buf_end, prop)
5430      int *buf, *buf_end;
5431      Lisp_Object prop;
5432 {
5433   enum composition_method method = COMPOSITION_METHOD (prop);
5434   int cmp_len = COMPOSITION_LENGTH (prop);
5435
5436   if (buf + 4 + (MAX_COMPOSITION_COMPONENTS * 2 - 1) > buf_end)
5437     return NULL;
5438
5439   buf[1] = CODING_ANNOTATE_COMPOSITION_MASK;
5440   buf[2] = method;
5441   buf[3] = cmp_len;
5442
5443   if (method == COMPOSITION_RELATIVE)
5444     buf[0] = 4;
5445   else
5446     {
5447       Lisp_Object components;
5448       int len, i;
5449
5450       components = COMPOSITION_COMPONENTS (prop);
5451       if (VECTORP (components))
5452         {
5453           len = XVECTOR (components)->size;
5454           for (i = 0; i < len; i++)
5455             buf[4 + i] = XINT (AREF (components, i));
5456         }
5457       else if (STRINGP (components))
5458         {
5459           int i_byte;
5460
5461           len = XSTRING (components)->size;
5462           i = i_byte = 0;
5463           while (i < len)
5464             FETCH_STRING_CHAR_ADVANCE (buf[4 + i], components, i, i_byte);
5465         }
5466       else if (INTEGERP (components))
5467         {
5468           len = 1;
5469           buf[4] = XINT (components);
5470         }
5471       else if (CONSP (components))
5472         {
5473           for (len = 0; CONSP (components);
5474                len++, components = XCDR (components))
5475             buf[4 + len] = XINT (XCAR (components));
5476         }
5477       else
5478         abort ();
5479       buf[0] = 4 + len;
5480     }
5481   return (buf + buf[0]);
5482 }
5483
5484 #define CHARBUF_SIZE 0x4000
5485
5486 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
5487   do {                                                                  \
5488     int size = CHARBUF_SIZE;;                                           \
5489                                                                         \
5490     coding->charbuf = NULL;                                             \
5491     while (size > 1024)                                                 \
5492       {                                                                 \
5493         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
5494         if (coding->charbuf)                                            \
5495           break;                                                        \
5496         size >>= 1;                                                     \
5497       }                                                                 \
5498     if (! coding->charbuf)                                              \
5499       {                                                                 \
5500         coding->result = CODING_RESULT_INSUFFICIENT_MEM;                \
5501         return coding->result;                                          \
5502       }                                                                 \
5503     coding->charbuf_size = size;                                        \
5504   } while (0)
5505
5506
5507 static void
5508 produce_annotation (coding)
5509      struct coding_system *coding;
5510 {
5511   int *charbuf = coding->charbuf;
5512   int *charbuf_end = charbuf + coding->charbuf_used;
5513
5514   while (charbuf < charbuf_end)
5515     {
5516       if (*charbuf >= 0)
5517         charbuf++;
5518       else
5519         {
5520           int len = -*charbuf;
5521           switch (charbuf[2])
5522             {
5523             case CODING_ANNOTATE_COMPOSITION_MASK:
5524               produce_composition (coding, charbuf);
5525               break;
5526             default:
5527               abort ();
5528             }
5529           charbuf += len;
5530         }
5531     }
5532 }
5533
5534 /* Decode the data at CODING->src_object into CODING->dst_object.
5535    CODING->src_object is a buffer, a string, or nil.
5536    CODING->dst_object is a buffer.
5537
5538    If CODING->src_object is a buffer, it must be the current buffer.
5539    In this case, if CODING->src_pos is positive, it is a position of
5540    the source text in the buffer, otherwise, the source text is in the
5541    gap area of the buffer, and CODING->src_pos specifies the offset of
5542    the text from GPT (which must be the same as PT).  If this is the
5543    same buffer as CODING->dst_object, CODING->src_pos must be
5544    negative.
5545
5546    If CODING->src_object is a string, CODING->src_pos in an index to
5547    that string.
5548
5549    If CODING->src_object is nil, CODING->source must already point to
5550    the non-relocatable memory area.  In this case, CODING->src_pos is
5551    an offset from CODING->source.
5552
5553    The decoded data is inserted at the current point of the buffer
5554    CODING->dst_object.
5555 */
5556
5557 static int
5558 decode_coding (coding)
5559      struct coding_system *coding;
5560 {
5561   Lisp_Object attrs;
5562
5563   if (BUFFERP (coding->src_object)
5564       && coding->src_pos > 0
5565       && coding->src_pos < GPT
5566       && coding->src_pos + coding->src_chars > GPT)
5567     move_gap_both (coding->src_pos, coding->src_pos_byte);
5568
5569   if (BUFFERP (coding->dst_object))
5570     {
5571       if (current_buffer != XBUFFER (coding->dst_object))
5572         set_buffer_internal (XBUFFER (coding->dst_object));
5573       if (GPT != PT)
5574         move_gap_both (PT, PT_BYTE);
5575     }
5576
5577   coding->consumed = coding->consumed_char = 0;
5578   coding->produced = coding->produced_char = 0;
5579   coding->chars_at_source = 0;
5580   coding->result = CODING_RESULT_SUCCESS;
5581   coding->errors = 0;
5582
5583   ALLOC_CONVERSION_WORK_AREA (coding);
5584
5585   attrs = CODING_ID_ATTRS (coding->id);
5586
5587   do
5588     {
5589       coding_set_source (coding);
5590       coding->annotated = 0;
5591       (*(coding->decoder)) (coding);
5592       if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5593         translate_chars (CODING_ATTR_DECODE_TBL (attrs), coding);
5594       coding_set_destination (coding);
5595       produce_chars (coding);
5596       if (coding->annotated)
5597         produce_annotation (coding);
5598     }
5599   while (coding->consumed < coding->src_bytes
5600          && ! coding->result);
5601
5602   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5603       && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5604       && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5605     decode_eol (coding);
5606
5607   coding->carryover_bytes = 0;
5608   if (coding->consumed < coding->src_bytes)
5609     {
5610       int nbytes = coding->src_bytes - coding->consumed;
5611       unsigned char *src;
5612
5613       coding_set_source (coding);
5614       coding_set_destination (coding);
5615       src = coding->source + coding->consumed;
5616
5617       if (coding->mode & CODING_MODE_LAST_BLOCK)
5618         {
5619           /* Flush out unprocessed data as binary chars.  We are sure
5620              that the number of data is less than the size of
5621              coding->charbuf.  */
5622           int *charbuf = coding->charbuf;
5623
5624           while (nbytes-- > 0)
5625             {
5626               int c = *src++;
5627               *charbuf++ =  (c & 0x80 ? - c : c);
5628             }
5629           produce_chars (coding);
5630         }
5631       else
5632         {
5633           /* Record unprocessed bytes in coding->carryover.  We are
5634              sure that the number of data is less than the size of
5635              coding->carryover.  */
5636           unsigned char *p = coding->carryover;
5637
5638           coding->carryover_bytes = nbytes;
5639           while (nbytes-- > 0)
5640             *p++ = *src++;
5641         }
5642       coding->consumed = coding->src_bytes;
5643     }
5644
5645   return coding->result;
5646 }
5647
5648 static void
5649 consume_chars (coding)
5650      struct coding_system *coding;
5651 {
5652   int *buf = coding->charbuf;
5653   /* -1 is to compensate for CRLF.  */
5654   int *buf_end = coding->charbuf + coding->charbuf_size - 1;
5655   unsigned char *src = coding->source + coding->consumed;
5656   int pos = coding->src_pos + coding->consumed_char;
5657   int end_pos = coding->src_pos + coding->src_chars;
5658   int multibytep = coding->src_multibyte;
5659   Lisp_Object eol_type;
5660   int c;
5661   int start, end, stop;
5662   Lisp_Object object, prop;
5663
5664   eol_type = CODING_ID_EOL_TYPE (coding->id);
5665   if (VECTORP (eol_type))
5666     eol_type = Qunix;
5667
5668   object = coding->src_object;
5669
5670   /* Note: composition handling is not yet implemented.  */
5671   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
5672
5673   if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK
5674       && find_composition (pos, end_pos, &start, &end, &prop, object)
5675       && end <= end_pos
5676       && (start >= pos
5677           || (find_composition (end, end_pos, &start, &end, &prop, object)
5678               && end <= end_pos)))
5679     stop = start;
5680   else
5681     stop = end_pos;
5682
5683   while (buf < buf_end)
5684     {
5685       if (pos == stop)
5686         {
5687           int *p;
5688
5689           if (pos == end_pos)
5690             break;
5691           p = save_composition_data (buf, buf_end, prop);
5692           if (p == NULL)
5693             break;
5694           buf = p;
5695           if (find_composition (end, end_pos, &start, &end, &prop, object)
5696               && end <= end_pos)
5697             stop = start;
5698           else
5699             stop = end_pos;
5700         }
5701
5702       if (! multibytep)
5703         c = *src++;
5704       else
5705         c = STRING_CHAR_ADVANCE (src);
5706       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
5707         c = '\n';
5708       if (! EQ (eol_type, Qunix))
5709         {
5710           if (c == '\n')
5711             {
5712               if (EQ (eol_type, Qdos))
5713                 *buf++ = '\r';
5714               else
5715                 c = '\r';
5716             }
5717         }
5718       *buf++ = c;
5719       pos++;
5720     }
5721
5722   coding->consumed = src - coding->source;
5723   coding->consumed_char = pos - coding->src_pos;
5724   coding->charbuf_used = buf - coding->charbuf;
5725   coding->chars_at_source = 0;
5726 }
5727
5728
5729 /* Encode the text at CODING->src_object into CODING->dst_object.
5730    CODING->src_object is a buffer or a string.
5731    CODING->dst_object is a buffer or nil.
5732
5733    If CODING->src_object is a buffer, it must be the current buffer.
5734    In this case, if CODING->src_pos is positive, it is a position of
5735    the source text in the buffer, otherwise. the source text is in the
5736    gap area of the buffer, and coding->src_pos specifies the offset of
5737    the text from GPT (which must be the same as PT).  If this is the
5738    same buffer as CODING->dst_object, CODING->src_pos must be
5739    negative and CODING should not have `pre-write-conversion'.
5740
5741    If CODING->src_object is a string, CODING should not have
5742    `pre-write-conversion'.
5743
5744    If CODING->dst_object is a buffer, the encoded data is inserted at
5745    the current point of that buffer.
5746
5747    If CODING->dst_object is nil, the encoded data is placed at the
5748    memory area specified by CODING->destination.  */
5749
5750 static int
5751 encode_coding (coding)
5752      struct coding_system *coding;
5753 {
5754   Lisp_Object attrs;
5755
5756   attrs = CODING_ID_ATTRS (coding->id);
5757
5758   if (BUFFERP (coding->dst_object))
5759     {
5760       set_buffer_internal (XBUFFER (coding->dst_object));
5761       coding->dst_multibyte
5762         = ! NILP (current_buffer->enable_multibyte_characters);
5763     }
5764
5765   coding->consumed = coding->consumed_char = 0;
5766   coding->produced = coding->produced_char = 0;
5767   coding->result = CODING_RESULT_SUCCESS;
5768   coding->errors = 0;
5769
5770   ALLOC_CONVERSION_WORK_AREA (coding);
5771
5772   do {
5773     coding_set_source (coding);
5774     consume_chars (coding);
5775
5776     if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
5777       translate_chars (CODING_ATTR_ENCODE_TBL (attrs), coding);
5778
5779     coding_set_destination (coding);
5780     (*(coding->encoder)) (coding);
5781   } while (coding->consumed_char < coding->src_chars);
5782
5783   if (BUFFERP (coding->dst_object))
5784     insert_from_gap (coding->produced_char, coding->produced);
5785
5786   return (coding->result);
5787 }
5788
5789 /* Work buffer */
5790
5791 /* List of currently used working buffer.  */
5792 Lisp_Object Vcode_conversion_work_buf_list;
5793
5794 /* A working buffer used by the top level conversion.  */
5795 Lisp_Object Vcode_conversion_reused_work_buf;
5796
5797
5798 /* Return a working buffer that can be freely used by the following
5799    code conversion.  MULTIBYTEP specifies the multibyteness of the
5800    buffer.  */
5801
5802 Lisp_Object
5803 make_conversion_work_buffer (multibytep)
5804      int multibytep;
5805 {
5806   struct buffer *current = current_buffer;
5807   Lisp_Object buf;
5808
5809   if (NILP (Vcode_conversion_work_buf_list))
5810     {
5811       if (NILP (Vcode_conversion_reused_work_buf))
5812         Vcode_conversion_reused_work_buf
5813           = Fget_buffer_create (build_string (" *code-conversion-work*"));
5814       Vcode_conversion_work_buf_list
5815         = Fcons (Vcode_conversion_reused_work_buf, Qnil);
5816     }
5817   else
5818     {
5819       int depth = Flength (Vcode_conversion_work_buf_list);
5820       char str[128];
5821
5822       sprintf (str, " *code-conversion-work*<%d>", depth);
5823       Vcode_conversion_work_buf_list
5824         = Fcons (Fget_buffer_create (build_string (str)),
5825                  Vcode_conversion_work_buf_list);
5826     }
5827
5828   buf = XCAR (Vcode_conversion_work_buf_list);
5829   set_buffer_internal (XBUFFER (buf));
5830   current_buffer->undo_list = Qt;
5831   Ferase_buffer ();
5832   Fset_buffer_multibyte (multibytep ? Qt : Qnil);
5833   set_buffer_internal (current);
5834   return buf;
5835 }
5836
5837 static struct coding_system *saved_coding;
5838
5839 Lisp_Object
5840 code_conversion_restore (info)
5841      Lisp_Object info;
5842 {
5843   int depth = Flength (Vcode_conversion_work_buf_list);
5844   Lisp_Object buf;
5845
5846   if (depth > 0)
5847     {
5848       buf = XCAR (Vcode_conversion_work_buf_list);
5849       Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
5850       if (depth > 1 && !NILP (Fbuffer_live_p (buf)))
5851         Fkill_buffer (buf);
5852     }
5853
5854   if (saved_coding->dst_object == Qt
5855       && saved_coding->destination)
5856     xfree (saved_coding->destination);
5857
5858   return save_excursion_restore (info);
5859 }
5860
5861
5862 int
5863 decode_coding_gap (coding, chars, bytes)
5864      struct coding_system *coding;
5865      EMACS_INT chars, bytes;
5866 {
5867   int count = specpdl_ptr - specpdl;
5868
5869   saved_coding = coding;
5870   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5871
5872   coding->src_object = Fcurrent_buffer ();
5873   coding->src_chars = chars;
5874   coding->src_bytes = bytes;
5875   coding->src_pos = -chars;
5876   coding->src_pos_byte = -bytes;
5877   coding->src_multibyte = chars < bytes;
5878   coding->dst_object = coding->src_object;
5879   coding->dst_pos = PT;
5880   coding->dst_pos_byte = PT_BYTE;
5881   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
5882
5883   if (CODING_REQUIRE_DETECTION (coding))
5884     detect_coding (coding);
5885
5886   decode_coding (coding);
5887
5888   unbind_to (count, Qnil);
5889   return coding->result;
5890 }
5891
5892 int
5893 encode_coding_gap (coding, chars, bytes)
5894      struct coding_system *coding;
5895      EMACS_INT chars, bytes;
5896 {
5897   int count = specpdl_ptr - specpdl;
5898   Lisp_Object buffer;
5899
5900   saved_coding = coding;
5901   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5902
5903   buffer = Fcurrent_buffer ();
5904   coding->src_object = buffer;
5905   coding->src_chars = chars;
5906   coding->src_bytes = bytes;
5907   coding->src_pos = -chars;
5908   coding->src_pos_byte = -bytes;
5909   coding->src_multibyte = chars < bytes;
5910   coding->dst_object = coding->src_object;
5911   coding->dst_pos = PT;
5912   coding->dst_pos_byte = PT_BYTE;
5913
5914   encode_coding (coding);
5915
5916   unbind_to (count, Qnil);
5917   return coding->result;
5918 }
5919
5920
5921 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5922    SRC_OBJECT into DST_OBJECT by coding context CODING.
5923
5924    SRC_OBJECT is a buffer, a string, or Qnil.
5925
5926    If it is a buffer, the text is at point of the buffer.  FROM and TO
5927    are positions in the buffer.
5928
5929    If it is a string, the text is at the beginning of the string.
5930    FROM and TO are indices to the string.
5931
5932    If it is nil, the text is at coding->source.  FROM and TO are
5933    indices to coding->source.
5934
5935    DST_OBJECT is a buffer, Qt, or Qnil.
5936
5937    If it is a buffer, the decoded text is inserted at point of the
5938    buffer.  If the buffer is the same as SRC_OBJECT, the source text
5939    is deleted.
5940
5941    If it is Qt, a string is made from the decoded text, and
5942    set in CODING->dst_object.
5943
5944    If it is Qnil, the decoded text is stored at CODING->destination.
5945    The called must allocate CODING->dst_bytes bytes at
5946    CODING->destination by xmalloc.  If the decoded text is longer than
5947    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5948  */
5949
5950 void
5951 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5952                       dst_object)
5953      struct coding_system *coding;
5954      Lisp_Object src_object;
5955      EMACS_INT from, from_byte, to, to_byte;
5956      Lisp_Object dst_object;
5957 {
5958   int count = specpdl_ptr - specpdl;
5959   unsigned char *destination;
5960   EMACS_INT dst_bytes;
5961   EMACS_INT chars = to - from;
5962   EMACS_INT bytes = to_byte - from_byte;
5963   Lisp_Object attrs;
5964
5965   saved_coding = coding;
5966   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5967
5968   if (NILP (dst_object))
5969     {
5970       destination = coding->destination;
5971       dst_bytes = coding->dst_bytes;
5972     }
5973
5974   coding->src_object = src_object;
5975   coding->src_chars = chars;
5976   coding->src_bytes = bytes;
5977   coding->src_multibyte = chars < bytes;
5978
5979   if (STRINGP (src_object))
5980     {
5981       coding->src_pos = from;
5982       coding->src_pos_byte = from_byte;
5983     }
5984   else if (BUFFERP (src_object))
5985     {
5986       set_buffer_internal (XBUFFER (src_object));
5987       if (from != GPT)
5988         move_gap_both (from, from_byte);
5989       if (EQ (src_object, dst_object))
5990         {
5991           TEMP_SET_PT_BOTH (from, from_byte);
5992           del_range_both (from, from_byte, to, to_byte, 1);
5993           coding->src_pos = -chars;
5994           coding->src_pos_byte = -bytes;
5995         }
5996       else
5997         {
5998           coding->src_pos = from;
5999           coding->src_pos_byte = from_byte;
6000         }
6001     }
6002
6003   if (CODING_REQUIRE_DETECTION (coding))
6004     detect_coding (coding);
6005   attrs = CODING_ID_ATTRS (coding->id);
6006
6007   if (! NILP (CODING_ATTR_POST_READ (attrs))
6008       || EQ (dst_object, Qt))
6009     {
6010       coding->dst_object = make_conversion_work_buffer (1);
6011       coding->dst_pos = BEG;
6012       coding->dst_pos_byte = BEG_BYTE;
6013       coding->dst_multibyte = 1;
6014     }
6015   else if (BUFFERP (dst_object))
6016     {
6017       coding->dst_object = dst_object;
6018       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6019       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6020       coding->dst_multibyte
6021         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6022     }
6023   else
6024     {
6025       coding->dst_object = Qnil;
6026       coding->dst_multibyte = 1;
6027     }
6028
6029   decode_coding (coding);
6030
6031   if (BUFFERP (coding->dst_object))
6032     set_buffer_internal (XBUFFER (coding->dst_object));
6033
6034   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6035     {
6036       struct gcpro gcpro1, gcpro2;
6037       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6038       Lisp_Object val;
6039
6040       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6041       GCPRO2 (coding->src_object, coding->dst_object);
6042       val = call1 (CODING_ATTR_POST_READ (attrs),
6043                    make_number (coding->produced_char));
6044       UNGCPRO;
6045       CHECK_NATNUM (val);
6046       coding->produced_char += Z - prev_Z;
6047       coding->produced += Z_BYTE - prev_Z_BYTE;
6048     }
6049
6050   if (EQ (dst_object, Qt))
6051     {
6052       coding->dst_object = Fbuffer_string ();
6053     }
6054   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6055     {
6056       set_buffer_internal (XBUFFER (coding->dst_object));
6057       if (dst_bytes < coding->produced)
6058         {
6059           destination
6060             = (unsigned char *) xrealloc (destination, coding->produced);
6061           if (! destination)
6062             {
6063               coding->result = CODING_RESULT_INSUFFICIENT_DST;
6064               unbind_to (count, Qnil);
6065               return;
6066             }
6067           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6068             move_gap_both (BEGV, BEGV_BYTE);
6069           bcopy (BEGV_ADDR, destination, coding->produced);
6070           coding->destination = destination;
6071         }
6072     }
6073
6074   unbind_to (count, Qnil);
6075 }
6076
6077
6078 void
6079 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6080                       dst_object)
6081      struct coding_system *coding;
6082      Lisp_Object src_object;
6083      EMACS_INT from, from_byte, to, to_byte;
6084      Lisp_Object dst_object;
6085 {
6086   int count = specpdl_ptr - specpdl;
6087   EMACS_INT chars = to - from;
6088   EMACS_INT bytes = to_byte - from_byte;
6089   Lisp_Object attrs;
6090
6091   saved_coding = coding;
6092   record_unwind_protect (code_conversion_restore, save_excursion_save ());
6093
6094   coding->src_object = src_object;
6095   coding->src_chars = chars;
6096   coding->src_bytes = bytes;
6097   coding->src_multibyte = chars < bytes;
6098
6099   attrs = CODING_ID_ATTRS (coding->id);
6100
6101   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6102     {
6103       coding->src_object = make_conversion_work_buffer (coding->src_multibyte);
6104       set_buffer_internal (XBUFFER (coding->src_object));
6105       if (STRINGP (src_object))
6106         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6107       else if (BUFFERP (src_object))
6108         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6109       else
6110         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6111
6112       if (EQ (src_object, dst_object))
6113         {
6114           set_buffer_internal (XBUFFER (src_object));
6115           del_range_both (from, from_byte, to, to_byte, 1);
6116           set_buffer_internal (XBUFFER (coding->src_object));
6117         }
6118
6119       call2 (CODING_ATTR_PRE_WRITE (attrs),
6120              make_number (BEG), make_number (Z));
6121       coding->src_object = Fcurrent_buffer ();
6122       if (BEG != GPT)
6123         move_gap_both (BEG, BEG_BYTE);
6124       coding->src_chars = Z - BEG;
6125       coding->src_bytes = Z_BYTE - BEG_BYTE;
6126       coding->src_pos = BEG;
6127       coding->src_pos_byte = BEG_BYTE;
6128       coding->src_multibyte = Z < Z_BYTE;
6129     }
6130   else if (STRINGP (src_object))
6131     {
6132       coding->src_pos = from;
6133       coding->src_pos_byte = from_byte;
6134     }
6135   else if (BUFFERP (src_object))
6136     {
6137       set_buffer_internal (XBUFFER (src_object));
6138       if (from != GPT)
6139         move_gap_both (from, from_byte);
6140       if (EQ (src_object, dst_object))
6141         {
6142           del_range_both (from, from_byte, to, to_byte, 1);
6143           coding->src_pos = -chars;
6144           coding->src_pos_byte = -bytes;
6145         }
6146       else
6147         {
6148           coding->src_pos = from;
6149           coding->src_pos_byte = from_byte;
6150         }
6151     }
6152
6153   if (BUFFERP (dst_object))
6154     {
6155       coding->dst_object = dst_object;
6156       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6157       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6158       coding->dst_multibyte
6159         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6160     }
6161   else if (EQ (dst_object, Qt))
6162     {
6163       coding->dst_object = Qnil;
6164       coding->dst_bytes = coding->src_chars;
6165       if (coding->dst_bytes == 0)
6166         coding->dst_bytes = 1;
6167       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6168       coding->dst_multibyte = 0;
6169     }
6170   else
6171     {
6172       coding->dst_object = Qnil;
6173       coding->dst_multibyte = 0;
6174     }
6175
6176   encode_coding (coding);
6177
6178   if (EQ (dst_object, Qt))
6179     {
6180       if (BUFFERP (coding->dst_object))
6181         coding->dst_object = Fbuffer_string ();
6182       else
6183         {
6184           coding->dst_object
6185             = make_unibyte_string ((char *) coding->destination,
6186                                    coding->produced);
6187           xfree (coding->destination);
6188         }
6189     }
6190
6191   unbind_to (count, Qnil);
6192 }
6193
6194
6195 Lisp_Object
6196 preferred_coding_system ()
6197 {
6198   int id = coding_categories[coding_priorities[0]].id;
6199
6200   return CODING_ID_NAME (id);
6201 }
6202
6203 \f
6204 #ifdef emacs
6205 /*** 8. Emacs Lisp library functions ***/
6206
6207 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6208        doc: /* Return t if OBJECT is nil or a coding-system.
6209 See the documentation of `define-coding-system' for information
6210 about coding-system objects.  */)
6211      (obj)
6212      Lisp_Object obj;
6213 {
6214   return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6215 }
6216
6217 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6218        Sread_non_nil_coding_system, 1, 1, 0,
6219        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6220      (prompt)
6221      Lisp_Object prompt;
6222 {
6223   Lisp_Object val;
6224   do
6225     {
6226       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6227                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6228     }
6229   while (XSTRING (val)->size == 0);
6230   return (Fintern (val, Qnil));
6231 }
6232
6233 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6234        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6235 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6236      (prompt, default_coding_system)
6237      Lisp_Object prompt, default_coding_system;
6238 {
6239   Lisp_Object val;
6240   if (SYMBOLP (default_coding_system))
6241     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6242   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6243                           Qt, Qnil, Qcoding_system_history,
6244                           default_coding_system, Qnil);
6245   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6246 }
6247
6248 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6249        1, 1, 0,
6250        doc: /* Check validity of CODING-SYSTEM.
6251 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6252 It is valid if it is a symbol with a non-nil `coding-system' property.
6253 The value of property should be a vector of length 5.  */)
6254   (coding_system)
6255      Lisp_Object coding_system;
6256 {
6257   CHECK_SYMBOL (coding_system);
6258   if (!NILP (Fcoding_system_p (coding_system)))
6259     return coding_system;
6260   while (1)
6261     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6262 }
6263
6264 \f
6265 Lisp_Object
6266 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6267      unsigned char *src;
6268      int src_bytes, highest;
6269      int multibytep;
6270      Lisp_Object coding_system;
6271 {
6272   unsigned char *src_end = src + src_bytes;
6273   int mask = CATEGORY_MASK_ANY;
6274   int detected = 0;
6275   int c, i;
6276   Lisp_Object attrs, eol_type;
6277   Lisp_Object val;
6278   struct coding_system coding;
6279
6280   if (NILP (coding_system))
6281     coding_system = Qundecided;
6282   setup_coding_system (coding_system, &coding);
6283   attrs = CODING_ID_ATTRS (coding.id);
6284   eol_type = CODING_ID_EOL_TYPE (coding.id);
6285
6286   coding.source = src;
6287   coding.src_bytes = src_bytes;
6288   coding.src_multibyte = multibytep;
6289   coding.consumed = 0;
6290
6291   if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided)
6292     {
6293       mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6294     }
6295   else
6296     {
6297       coding_system = Qnil;
6298       for (; src < src_end; src++)
6299         {
6300           c = *src;
6301           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
6302                                         || c == ISO_CODE_SI
6303                                         || c == ISO_CODE_SO)))
6304             break;
6305         }
6306       coding.head_ascii = src - coding.source;
6307
6308       if (src < src_end)
6309         for (i = 0; i < coding_category_raw_text; i++)
6310           {
6311             enum coding_category category = coding_priorities[i];
6312             struct coding_system *this = coding_categories + category;
6313
6314             if (category >= coding_category_raw_text
6315                 || detected & (1 << category))
6316               continue;
6317
6318             if (this->id < 0)
6319               {
6320                 /* No coding system of this category is defined.  */
6321                 mask &= ~(1 << category);
6322               }
6323             else
6324               {
6325                 detected |= detected_mask[category];
6326                 if ((*(coding_categories[category].detector)) (&coding, &mask)
6327                     && highest)
6328                   {
6329                     mask &= detected_mask[category];
6330                     break;
6331                   }
6332               }
6333           }
6334     }
6335
6336   if (!mask)
6337     val = Fcons (make_number (coding_category_raw_text), Qnil);
6338   else if (mask == CATEGORY_MASK_ANY)
6339     val = Fcons (make_number (coding_category_undecided), Qnil);
6340   else if (highest)
6341     {
6342       for (i = 0; i < coding_category_raw_text; i++)
6343         if (mask & (1 << coding_priorities[i]))
6344           {
6345             val = Fcons (make_number (coding_priorities[i]), Qnil);
6346             break;
6347           }
6348     }
6349   else
6350     {
6351       val = Qnil;
6352       for (i = coding_category_raw_text - 1; i >= 0; i--)
6353         if (mask & (1 << coding_priorities[i]))
6354           val = Fcons (make_number (coding_priorities[i]), val);
6355     }
6356
6357   {
6358     int one_byte_eol = -1, two_byte_eol = -1;
6359     Lisp_Object tail;
6360
6361     for (tail = val; CONSP (tail); tail = XCDR (tail))
6362       {
6363         struct coding_system *this
6364           = (NILP (coding_system) ? coding_categories + XINT (XCAR (tail))
6365              : &coding);
6366         int this_eol;
6367
6368         attrs = CODING_ID_ATTRS (this->id);
6369         eol_type = CODING_ID_EOL_TYPE (this->id);
6370         XSETCAR (tail, CODING_ID_NAME (this->id));
6371         if (VECTORP (eol_type))
6372           {
6373             if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16))
6374               {
6375                 if (two_byte_eol < 0)
6376                   two_byte_eol = detect_eol (this, coding.source, src_bytes);
6377                 this_eol = two_byte_eol;
6378               }
6379             else
6380               {
6381                 if (one_byte_eol < 0)
6382                   one_byte_eol =detect_eol (this, coding.source, src_bytes);
6383                 this_eol = one_byte_eol;
6384               }
6385             if (this_eol == EOL_SEEN_LF)
6386               XSETCAR (tail, AREF (eol_type, 0));
6387             else if (this_eol == EOL_SEEN_CRLF)
6388               XSETCAR (tail, AREF (eol_type, 1));
6389             else if (this_eol == EOL_SEEN_CR)
6390               XSETCAR (tail, AREF (eol_type, 2));
6391           }
6392       }
6393   }
6394
6395   return (highest ? XCAR (val) : val);
6396 }
6397
6398
6399 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6400        2, 3, 0,
6401        doc: /* Detect coding system of the text in the region between START and END.
6402 Return a list of possible coding systems ordered by priority.
6403
6404 If only ASCII characters are found, it returns a list of single element
6405 `undecided' or its subsidiary coding system according to a detected
6406 end-of-line format.
6407
6408 If optional argument HIGHEST is non-nil, return the coding system of
6409 highest priority.  */)
6410      (start, end, highest)
6411      Lisp_Object start, end, highest;
6412 {
6413   int from, to;
6414   int from_byte, to_byte;
6415
6416   CHECK_NUMBER_COERCE_MARKER (start);
6417   CHECK_NUMBER_COERCE_MARKER (end);
6418
6419   validate_region (&start, &end);
6420   from = XINT (start), to = XINT (end);
6421   from_byte = CHAR_TO_BYTE (from);
6422   to_byte = CHAR_TO_BYTE (to);
6423
6424   if (from < GPT && to >= GPT)
6425     move_gap_both (to, to_byte);
6426
6427   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6428                                to_byte - from_byte,
6429                                !NILP (highest),
6430                                !NILP (current_buffer
6431                                       ->enable_multibyte_characters),
6432                                Qnil);
6433 }
6434
6435 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6436        1, 2, 0,
6437        doc: /* Detect coding system of the text in STRING.
6438 Return a list of possible coding systems ordered by priority.
6439
6440 If only ASCII characters are found, it returns a list of single element
6441 `undecided' or its subsidiary coding system according to a detected
6442 end-of-line format.
6443
6444 If optional argument HIGHEST is non-nil, return the coding system of
6445 highest priority.  */)
6446      (string, highest)
6447      Lisp_Object string, highest;
6448 {
6449   CHECK_STRING (string);
6450
6451   return detect_coding_system (XSTRING (string)->data,
6452                                STRING_BYTES (XSTRING (string)),
6453                                !NILP (highest),
6454                                STRING_MULTIBYTE (string),
6455                                Qnil);
6456 }
6457
6458
6459 static INLINE int
6460 char_encodable_p (c, attrs)
6461      int c;
6462      Lisp_Object attrs;
6463 {
6464   Lisp_Object tail;
6465   struct charset *charset;
6466
6467   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6468        CONSP (tail); tail = XCDR (tail))
6469     {
6470       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6471       if (CHAR_CHARSET_P (c, charset))
6472         break;
6473     }
6474   return (! NILP (tail));
6475 }
6476
6477
6478 /* Return a list of coding systems that safely encode the text between
6479    START and END.  If EXCLUDE is non-nil, it is a list of coding
6480    systems not to check.  The returned list doesn't contain any such
6481    coding systems.  In any case, If the text contains only ASCII or is
6482    unibyte, return t.  */
6483
6484 DEFUN ("find-coding-systems-region-internal",
6485        Ffind_coding_systems_region_internal,
6486        Sfind_coding_systems_region_internal, 2, 3, 0,
6487        doc: /* Internal use only.  */)
6488      (start, end, exclude)
6489      Lisp_Object start, end, exclude;
6490 {
6491   Lisp_Object coding_attrs_list, safe_codings;
6492   EMACS_INT start_byte, end_byte;
6493   unsigned char *p, *pbeg, *pend;
6494   int c;
6495   Lisp_Object tail, elt;
6496
6497   if (STRINGP (start))
6498     {
6499       if (!STRING_MULTIBYTE (start)
6500           && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6501         return Qt;
6502       start_byte = 0;
6503       end_byte = STRING_BYTES (XSTRING (start));
6504     }
6505   else
6506     {
6507       CHECK_NUMBER_COERCE_MARKER (start);
6508       CHECK_NUMBER_COERCE_MARKER (end);
6509       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6510         args_out_of_range (start, end);
6511       if (NILP (current_buffer->enable_multibyte_characters))
6512         return Qt;
6513       start_byte = CHAR_TO_BYTE (XINT (start));
6514       end_byte = CHAR_TO_BYTE (XINT (end));
6515       if (XINT (end) - XINT (start) == end_byte - start_byte)
6516         return Qt;
6517
6518       if (start < GPT && end > GPT)
6519         {
6520           if ((GPT - start) < (end - GPT))
6521             move_gap_both (start, start_byte);
6522           else
6523             move_gap_both (end, end_byte);
6524         }
6525     }
6526
6527   coding_attrs_list = Qnil;
6528   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
6529     if (NILP (exclude)
6530         || NILP (Fmemq (XCAR (tail), exclude)))
6531       {
6532         Lisp_Object attrs;
6533
6534         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
6535         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
6536             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6537           coding_attrs_list = Fcons (attrs, coding_attrs_list);
6538       }
6539
6540   if (STRINGP (start))
6541     p = pbeg = XSTRING (start)->data;
6542   else
6543     p = pbeg = BYTE_POS_ADDR (start_byte);
6544   pend = p + (end_byte - start_byte);
6545
6546   while (p < pend && ASCII_BYTE_P (*p)) p++;
6547   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6548
6549   while (p < pend)
6550     {
6551       if (ASCII_BYTE_P (*p))
6552         p++;
6553       else
6554         {
6555           c = STRING_CHAR_ADVANCE (p);
6556
6557           charset_map_loaded = 0;
6558           for (tail = coding_attrs_list; CONSP (tail);)
6559             {
6560               elt = XCAR (tail);
6561               if (NILP (elt))
6562                 tail = XCDR (tail);
6563               else if (char_encodable_p (c, elt))
6564                 tail = XCDR (tail);
6565               else if (CONSP (XCDR (tail)))
6566                 {
6567                   XSETCAR (tail, XCAR (XCDR (tail)));
6568                   XSETCDR (tail, XCDR (XCDR (tail)));
6569                 }
6570               else
6571                 {
6572                   XSETCAR (tail, Qnil);
6573                   tail = XCDR (tail);
6574                 }
6575             }
6576           if (charset_map_loaded)
6577             {
6578               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6579
6580               if (STRINGP (start))
6581                 pbeg = XSTRING (start)->data;
6582               else
6583                 pbeg = BYTE_POS_ADDR (start_byte);
6584               p = pbeg + p_offset;
6585               pend = pbeg + pend_offset;
6586             }
6587         }
6588     }
6589
6590   safe_codings = Qnil;
6591   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
6592     if (! NILP (XCAR (tail)))
6593       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
6594
6595   return safe_codings;
6596 }
6597
6598
6599 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
6600        Scheck_coding_systems_region, 3, 3, 0,
6601        doc: /* Check if the region is encodable by coding systems.
6602
6603 START and END are buffer positions specifying the region.
6604 CODING-SYSTEM-LIST is a list of coding systems to check.
6605
6606 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6607 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6608 whole region, POS0, POS1, ... are buffer positions where non-encodable
6609 characters are found.
6610
6611 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6612 value is nil.
6613
6614 START may be a string.  In that case, check if the string is
6615 encodable, and the value contains indices to the string instead of
6616 buffer positions.  END is ignored.  */)
6617      (start, end, coding_system_list)
6618      Lisp_Object start, end, coding_system_list;
6619 {
6620   Lisp_Object list;
6621   EMACS_INT start_byte, end_byte;
6622   int pos;
6623   unsigned char *p, *pbeg, *pend;
6624   int c;
6625   Lisp_Object tail, elt;
6626
6627   if (STRINGP (start))
6628     {
6629       if (!STRING_MULTIBYTE (start)
6630           && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6631         return Qnil;
6632       start_byte = 0;
6633       end_byte = STRING_BYTES (XSTRING (start));
6634       pos = 0;
6635     }
6636   else
6637     {
6638       CHECK_NUMBER_COERCE_MARKER (start);
6639       CHECK_NUMBER_COERCE_MARKER (end);
6640       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6641         args_out_of_range (start, end);
6642       if (NILP (current_buffer->enable_multibyte_characters))
6643         return Qnil;
6644       start_byte = CHAR_TO_BYTE (XINT (start));
6645       end_byte = CHAR_TO_BYTE (XINT (end));
6646       if (XINT (end) - XINT (start) == end_byte - start_byte)
6647         return Qt;
6648
6649       if (start < GPT && end > GPT)
6650         {
6651           if ((GPT - start) < (end - GPT))
6652             move_gap_both (start, start_byte);
6653           else
6654             move_gap_both (end, end_byte);
6655         }
6656       pos = start;
6657     }
6658
6659   list = Qnil;
6660   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
6661     {
6662       elt = XCAR (tail);
6663       list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
6664                                        Qnil)),
6665                     list);
6666     }
6667
6668   if (STRINGP (start))
6669     p = pbeg = XSTRING (start)->data;
6670   else
6671     p = pbeg = BYTE_POS_ADDR (start_byte);
6672   pend = p + (end_byte - start_byte);
6673
6674   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
6675   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6676
6677   while (p < pend)
6678     {
6679       if (ASCII_BYTE_P (*p))
6680         p++;
6681       else
6682         {
6683           c = STRING_CHAR_ADVANCE (p);
6684
6685           charset_map_loaded = 0;
6686           for (tail = list; CONSP (tail); tail = XCDR (tail))
6687             {
6688               elt = XCDR (XCAR (tail));
6689               if (! char_encodable_p (c, XCAR (elt)))
6690                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
6691             }
6692           if (charset_map_loaded)
6693             {
6694               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6695
6696               if (STRINGP (start))
6697                 pbeg = XSTRING (start)->data;
6698               else
6699                 pbeg = BYTE_POS_ADDR (start_byte);
6700               p = pbeg + p_offset;
6701               pend = pbeg + pend_offset;
6702             }
6703         }
6704       pos++;
6705     }
6706
6707   tail = list;
6708   list = Qnil;
6709   for (; CONSP (tail); tail = XCDR (tail))
6710     {
6711       elt = XCAR (tail);
6712       if (CONSP (XCDR (XCDR (elt))))
6713         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
6714                       list);
6715     }
6716
6717   return list;
6718 }
6719
6720
6721
6722 Lisp_Object
6723 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
6724      Lisp_Object start, end, coding_system, dst_object;
6725      int encodep, norecord;
6726 {
6727   struct coding_system coding;
6728   EMACS_INT from, from_byte, to, to_byte;
6729   Lisp_Object src_object;
6730
6731   CHECK_NUMBER_COERCE_MARKER (start);
6732   CHECK_NUMBER_COERCE_MARKER (end);
6733   if (NILP (coding_system))
6734     coding_system = Qno_conversion;
6735   else
6736     CHECK_CODING_SYSTEM (coding_system);
6737   src_object = Fcurrent_buffer ();
6738   if (NILP (dst_object))
6739     dst_object = src_object;
6740   else if (! EQ (dst_object, Qt))
6741     CHECK_BUFFER (dst_object);
6742
6743   validate_region (&start, &end);
6744   from = XFASTINT (start);
6745   from_byte = CHAR_TO_BYTE (from);
6746   to = XFASTINT (end);
6747   to_byte = CHAR_TO_BYTE (to);
6748
6749   setup_coding_system (coding_system, &coding);
6750   coding.mode |= CODING_MODE_LAST_BLOCK;
6751
6752   if (encodep)
6753     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6754                           dst_object);
6755   else
6756     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6757                           dst_object);
6758   if (! norecord)
6759     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6760
6761   if (coding.result != CODING_RESULT_SUCCESS)
6762     error ("Code conversion error: %d", coding.result);
6763
6764   return (BUFFERP (dst_object)
6765           ? make_number (coding.produced_char)
6766           : coding.dst_object);
6767 }
6768
6769
6770 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6771        3, 4, "r\nzCoding system: ",
6772        doc: /* Decode the current region from the specified coding system.
6773 When called from a program, takes four arguments:
6774         START, END, CODING-SYSTEM, and DESTINATION.
6775 START and END are buffer positions.
6776
6777 Optional 4th arguments DESTINATION specifies where the decoded text goes.
6778 If nil, the region between START and END is replace by the decoded text.
6779 If buffer, the decoded text is inserted in the buffer.
6780 If t, the decoded text is returned.
6781
6782 This function sets `last-coding-system-used' to the precise coding system
6783 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6784 not fully specified.)
6785 It returns the length of the decoded text.  */)
6786      (start, end, coding_system, destination)
6787      Lisp_Object start, end, coding_system, destination;
6788 {
6789   return code_convert_region (start, end, coding_system, destination, 0, 0);
6790 }
6791
6792 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6793        3, 4, "r\nzCoding system: ",
6794        doc: /* Encode the current region by specified coding system.
6795 When called from a program, takes three arguments:
6796 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6797
6798 Optional 4th arguments DESTINATION specifies where the encoded text goes.
6799 If nil, the region between START and END is replace by the encoded text.
6800 If buffer, the encoded text is inserted in the buffer.
6801 If t, the encoded text is returned.
6802
6803 This function sets `last-coding-system-used' to the precise coding system
6804 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6805 not fully specified.)
6806 It returns the length of the encoded text.  */)
6807   (start, end, coding_system, destination)
6808      Lisp_Object start, end, coding_system, destination;
6809 {
6810   return code_convert_region (start, end, coding_system, destination, 1, 0);
6811 }
6812
6813 Lisp_Object
6814 code_convert_string (string, coding_system, dst_object,
6815                      encodep, nocopy, norecord)
6816      Lisp_Object string, coding_system, dst_object;
6817      int encodep, nocopy, norecord;
6818 {
6819   struct coding_system coding;
6820   EMACS_INT chars, bytes;
6821
6822   CHECK_STRING (string);
6823   if (NILP (coding_system))
6824     {
6825       if (! norecord)
6826         Vlast_coding_system_used = Qno_conversion;
6827       if (NILP (dst_object))
6828         return (nocopy ? Fcopy_sequence (string) : string);
6829     }
6830
6831   if (NILP (coding_system))
6832     coding_system = Qno_conversion;
6833   else
6834     CHECK_CODING_SYSTEM (coding_system);
6835   if (NILP (dst_object))
6836     dst_object = Qt;
6837   else if (! EQ (dst_object, Qt))
6838     CHECK_BUFFER (dst_object);
6839
6840   setup_coding_system (coding_system, &coding);
6841   coding.mode |= CODING_MODE_LAST_BLOCK;
6842   chars = XSTRING (string)->size;
6843   bytes = STRING_BYTES (XSTRING (string));
6844   if (encodep)
6845     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6846   else
6847     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6848   if (! norecord)
6849     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6850
6851   if (coding.result != CODING_RESULT_SUCCESS)
6852     error ("Code conversion error: %d", coding.result);
6853
6854   return (BUFFERP (dst_object)
6855           ? make_number (coding.produced_char)
6856           : coding.dst_object);
6857 }
6858
6859
6860 /* Encode or decode STRING according to CODING_SYSTEM.
6861    Do not set Vlast_coding_system_used.
6862
6863    This function is called only from macros DECODE_FILE and
6864    ENCODE_FILE, thus we ignore character composition.  */
6865
6866 Lisp_Object
6867 code_convert_string_norecord (string, coding_system, encodep)
6868      Lisp_Object string, coding_system;
6869      int encodep;
6870 {
6871   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
6872 }
6873
6874
6875 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6876        2, 4, 0,
6877        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6878
6879 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6880 if the decoding operation is trivial.
6881
6882 Optional fourth arg BUFFER non-nil meant that the decoded text is
6883 inserted in BUFFER instead of returned as a astring.  In this case,
6884 the return value is BUFFER.
6885
6886 This function sets `last-coding-system-used' to the precise coding system
6887 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6888 not fully specified.  */)
6889   (string, coding_system, nocopy, buffer)
6890      Lisp_Object string, coding_system, nocopy, buffer;
6891 {
6892   return code_convert_string (string, coding_system, buffer,
6893                               0, ! NILP (nocopy), 0);
6894 }
6895
6896 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6897        2, 4, 0,
6898        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6899
6900 Optional third arg NOCOPY non-nil means it is OK to return STRING
6901 itself if the encoding operation is trivial.
6902
6903 Optional fourth arg BUFFER non-nil meant that the encoded text is
6904 inserted in BUFFER instead of returned as a astring.  In this case,
6905 the return value is BUFFER.
6906
6907 This function sets `last-coding-system-used' to the precise coding system
6908 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6909 not fully specified.)  */)
6910      (string, coding_system, nocopy, buffer)
6911      Lisp_Object string, coding_system, nocopy, buffer;
6912 {
6913   return code_convert_string (string, coding_system, buffer,
6914                               nocopy, ! NILP (nocopy), 1);
6915 }
6916
6917 \f
6918 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6919        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6920 Return the corresponding character.  */)
6921      (code)
6922      Lisp_Object code;
6923 {
6924   Lisp_Object spec, attrs, val;
6925   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
6926   int c;
6927
6928   CHECK_NATNUM (code);
6929   c = XFASTINT (code);
6930   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6931   attrs = AREF (spec, 0);
6932
6933   if (ASCII_BYTE_P (c)
6934       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6935     return code;
6936
6937   val = CODING_ATTR_CHARSET_LIST (attrs);
6938   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6939   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6940   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
6941
6942   if (c <= 0x7F)
6943     charset = charset_roman;
6944   else if (c >= 0xA0 && c < 0xDF)
6945     {
6946       charset = charset_kana;
6947       c -= 0x80;
6948     }
6949   else
6950     {
6951       int s1 = c >> 8, s2 = c & 0xFF;
6952
6953       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
6954           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
6955         error ("Invalid code: %d", code);
6956       SJIS_TO_JIS (c);
6957       charset = charset_kanji;
6958     }
6959   c = DECODE_CHAR (charset, c);
6960   if (c < 0)
6961     error ("Invalid code: %d", code);
6962   return make_number (c);
6963 }
6964
6965
6966 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6967        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6968 Return the corresponding code in SJIS.  */)
6969      (ch)
6970     Lisp_Object ch;
6971 {
6972   Lisp_Object spec, attrs, charset_list;
6973   int c;
6974   struct charset *charset;
6975   unsigned code;
6976
6977   CHECK_CHARACTER (ch);
6978   c = XFASTINT (ch);
6979   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6980   attrs = AREF (spec, 0);
6981
6982   if (ASCII_CHAR_P (c)
6983       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6984     return ch;
6985
6986   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6987   charset = char_charset (c, charset_list, &code);
6988   if (code == CHARSET_INVALID_CODE (charset))
6989     error ("Can't encode by shift_jis encoding: %d", c);
6990   JIS_TO_SJIS (code);
6991
6992   return make_number (code);
6993 }
6994
6995 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6996        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
6997 Return the corresponding character.  */)
6998      (code)
6999      Lisp_Object code;
7000 {
7001   Lisp_Object spec, attrs, val;
7002   struct charset *charset_roman, *charset_big5, *charset;
7003   int c;
7004
7005   CHECK_NATNUM (code);
7006   c = XFASTINT (code);
7007   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7008   attrs = AREF (spec, 0);
7009
7010   if (ASCII_BYTE_P (c)
7011       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7012     return code;
7013
7014   val = CODING_ATTR_CHARSET_LIST (attrs);
7015   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7016   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7017
7018   if (c <= 0x7F)
7019     charset = charset_roman;
7020   else
7021     {
7022       int b1 = c >> 8, b2 = c & 0x7F;
7023       if (b1 < 0xA1 || b1 > 0xFE
7024           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7025         error ("Invalid code: %d", code);
7026       charset = charset_big5;
7027     }
7028   c = DECODE_CHAR (charset, (unsigned )c);
7029   if (c < 0)
7030     error ("Invalid code: %d", code);
7031   return make_number (c);
7032 }
7033
7034 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7035        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7036 Return the corresponding character code in Big5.  */)
7037      (ch)
7038      Lisp_Object ch;
7039 {
7040   Lisp_Object spec, attrs, charset_list;
7041   struct charset *charset;
7042   int c;
7043   unsigned code;
7044
7045   CHECK_CHARACTER (ch);
7046   c = XFASTINT (ch);
7047   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7048   attrs = AREF (spec, 0);
7049   if (ASCII_CHAR_P (c)
7050       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7051     return ch;
7052
7053   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7054   charset = char_charset (c, charset_list, &code);
7055   if (code == CHARSET_INVALID_CODE (charset))
7056     error ("Can't encode by Big5 encoding: %d", c);
7057
7058   return make_number (code);
7059 }
7060
7061 \f
7062 DEFUN ("set-terminal-coding-system-internal",
7063        Fset_terminal_coding_system_internal,
7064        Sset_terminal_coding_system_internal, 1, 1, 0,
7065        doc: /* Internal use only.  */)
7066      (coding_system)
7067      Lisp_Object coding_system;
7068 {
7069   CHECK_SYMBOL (coding_system);
7070   setup_coding_system (Fcheck_coding_system (coding_system),
7071                         &terminal_coding);
7072
7073   /* We had better not send unsafe characters to terminal.  */
7074   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7075   /* Characer composition should be disabled.  */
7076   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7077   terminal_coding.src_multibyte = 1;
7078   terminal_coding.dst_multibyte = 0;
7079   return Qnil;
7080 }
7081
7082 DEFUN ("set-safe-terminal-coding-system-internal",
7083        Fset_safe_terminal_coding_system_internal,
7084        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7085        doc: /* Internal use only.  */)
7086      (coding_system)
7087      Lisp_Object coding_system;
7088 {
7089   CHECK_SYMBOL (coding_system);
7090   setup_coding_system (Fcheck_coding_system (coding_system),
7091                        &safe_terminal_coding);
7092   /* Characer composition should be disabled.  */
7093   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7094   safe_terminal_coding.src_multibyte = 1;
7095   safe_terminal_coding.dst_multibyte = 0;
7096   return Qnil;
7097 }
7098
7099 DEFUN ("terminal-coding-system",
7100        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7101        doc: /* Return coding system specified for terminal output.  */)
7102      ()
7103 {
7104   return CODING_ID_NAME (terminal_coding.id);
7105 }
7106
7107 DEFUN ("set-keyboard-coding-system-internal",
7108        Fset_keyboard_coding_system_internal,
7109        Sset_keyboard_coding_system_internal, 1, 1, 0,
7110        doc: /* Internal use only.  */)
7111      (coding_system)
7112      Lisp_Object coding_system;
7113 {
7114   CHECK_SYMBOL (coding_system);
7115   setup_coding_system (Fcheck_coding_system (coding_system),
7116                        &keyboard_coding);
7117   /* Characer composition should be disabled.  */
7118   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7119   return Qnil;
7120 }
7121
7122 DEFUN ("keyboard-coding-system",
7123        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7124        doc: /* Return coding system specified for decoding keyboard input.  */)
7125      ()
7126 {
7127   return CODING_ID_NAME (keyboard_coding.id);
7128 }
7129
7130 \f
7131 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7132        Sfind_operation_coding_system,  1, MANY, 0,
7133        doc: /* Choose a coding system for an operation based on the target name.
7134 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7135 DECODING-SYSTEM is the coding system to use for decoding
7136 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7137 for encoding (in case OPERATION does encoding).
7138
7139 The first argument OPERATION specifies an I/O primitive:
7140   For file I/O, `insert-file-contents' or `write-region'.
7141   For process I/O, `call-process', `call-process-region', or `start-process'.
7142   For network I/O, `open-network-stream'.
7143
7144 The remaining arguments should be the same arguments that were passed
7145 to the primitive.  Depending on which primitive, one of those arguments
7146 is selected as the TARGET.  For example, if OPERATION does file I/O,
7147 whichever argument specifies the file name is TARGET.
7148
7149 TARGET has a meaning which depends on OPERATION:
7150   For file I/O, TARGET is a file name.
7151   For process I/O, TARGET is a process name.
7152   For network I/O, TARGET is a service name or a port number
7153
7154 This function looks up what specified for TARGET in,
7155 `file-coding-system-alist', `process-coding-system-alist',
7156 or `network-coding-system-alist' depending on OPERATION.
7157 They may specify a coding system, a cons of coding systems,
7158 or a function symbol to call.
7159 In the last case, we call the function with one argument,
7160 which is a list of all the arguments given to this function.
7161
7162 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7163      (nargs, args)
7164      int nargs;
7165      Lisp_Object *args;
7166 {
7167   Lisp_Object operation, target_idx, target, val;
7168   register Lisp_Object chain;
7169
7170   if (nargs < 2)
7171     error ("Too few arguments");
7172   operation = args[0];
7173   if (!SYMBOLP (operation)
7174       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7175     error ("Invalid first arguement");
7176   if (nargs < 1 + XINT (target_idx))
7177     error ("Too few arguments for operation: %s",
7178            XSYMBOL (operation)->name->data);
7179   target = args[XINT (target_idx) + 1];
7180   if (!(STRINGP (target)
7181         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7182     error ("Invalid %dth argument", XINT (target_idx) + 1);
7183
7184   chain = ((EQ (operation, Qinsert_file_contents)
7185             || EQ (operation, Qwrite_region))
7186            ? Vfile_coding_system_alist
7187            : (EQ (operation, Qopen_network_stream)
7188               ? Vnetwork_coding_system_alist
7189               : Vprocess_coding_system_alist));
7190   if (NILP (chain))
7191     return Qnil;
7192
7193   for (; CONSP (chain); chain = XCDR (chain))
7194     {
7195       Lisp_Object elt;
7196
7197       elt = XCAR (chain);
7198       if (CONSP (elt)
7199           && ((STRINGP (target)
7200                && STRINGP (XCAR (elt))
7201                && fast_string_match (XCAR (elt), target) >= 0)
7202               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7203         {
7204           val = XCDR (elt);
7205           /* Here, if VAL is both a valid coding system and a valid
7206              function symbol, we return VAL as a coding system.  */
7207           if (CONSP (val))
7208             return val;
7209           if (! SYMBOLP (val))
7210             return Qnil;
7211           if (! NILP (Fcoding_system_p (val)))
7212             return Fcons (val, val);
7213           if (! NILP (Ffboundp (val)))
7214             {
7215               val = call1 (val, Flist (nargs, args));
7216               if (CONSP (val))
7217                 return val;
7218               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7219                 return Fcons (val, val);
7220             }
7221           return Qnil;
7222         }
7223     }
7224   return Qnil;
7225 }
7226
7227 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7228        Sset_coding_system_priority, 1, MANY, 0,
7229        doc: /* Assign higher priority to the coding systems given as arguments.
7230 usage: (set-coding-system-priority CODING-SYSTEM ...)  */)
7231      (nargs, args)
7232      int nargs;
7233      Lisp_Object *args;
7234 {
7235   int i, j;
7236   int changed[coding_category_max];
7237   enum coding_category priorities[coding_category_max];
7238
7239   bzero (changed, sizeof changed);
7240
7241   for (i = j = 0; i < nargs; i++)
7242     {
7243       enum coding_category category;
7244       Lisp_Object spec, attrs;
7245
7246       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7247       attrs = AREF (spec, 0);
7248       category = XINT (CODING_ATTR_CATEGORY (attrs));
7249       if (changed[category])
7250         /* Ignore this coding system because a coding system of the
7251            same category already had a higher priority.  */
7252         continue;
7253       changed[category] = 1;
7254       priorities[j++] = category;
7255       if (coding_categories[category].id >= 0
7256           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7257         setup_coding_system (args[i], &coding_categories[category]);
7258     }
7259
7260   /* Now we have decided top J priorities.  Reflect the order of the
7261      original priorities to the remaining priorities.  */
7262
7263   for (i = j, j = 0; i < coding_category_max; i++, j++)
7264     {
7265       while (j < coding_category_max
7266              && changed[coding_priorities[j]])
7267         j++;
7268       if (j == coding_category_max)
7269         abort ();
7270       priorities[i] = coding_priorities[j];
7271     }
7272
7273   bcopy (priorities, coding_priorities, sizeof priorities);
7274   return Qnil;
7275 }
7276
7277 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7278        Scoding_system_priority_list, 0, 1, 0,
7279        doc: /* Return a list of coding systems ordered by their priorities.
7280 HIGHESTP non-nil means just return the highest priority one.  */)
7281      (highestp)
7282      Lisp_Object highestp;
7283 {
7284   int i;
7285   Lisp_Object val;
7286
7287   for (i = 0, val = Qnil; i < coding_category_max; i++)
7288     {
7289       enum coding_category category = coding_priorities[i];
7290       int id = coding_categories[category].id;
7291       Lisp_Object attrs;
7292
7293       if (id < 0)
7294         continue;
7295       attrs = CODING_ID_ATTRS (id);
7296       if (! NILP (highestp))
7297         return CODING_ATTR_BASE_NAME (attrs);
7298       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7299     }
7300   return Fnreverse (val);
7301 }
7302
7303 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7304
7305 static Lisp_Object
7306 make_subsidiaries (base)
7307      Lisp_Object base;
7308 {
7309   Lisp_Object subsidiaries;
7310   int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7311   char *buf = (char *) alloca (base_name_len + 6);
7312   int i;
7313
7314   bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7315   subsidiaries = Fmake_vector (make_number (3), Qnil);
7316   for (i = 0; i < 3; i++)
7317     {
7318       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7319       ASET (subsidiaries, i, intern (buf));
7320     }
7321   return subsidiaries;
7322 }
7323
7324
7325 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7326        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7327        doc: /* For internal use only.
7328 usage: (define-coding-system-internal ...)  */)
7329      (nargs, args)
7330      int nargs;
7331      Lisp_Object *args;
7332 {
7333   Lisp_Object name;
7334   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
7335   Lisp_Object attrs;            /* Vector of attributes.  */
7336   Lisp_Object eol_type;
7337   Lisp_Object aliases;
7338   Lisp_Object coding_type, charset_list, safe_charsets;
7339   enum coding_category category;
7340   Lisp_Object tail, val;
7341   int max_charset_id = 0;
7342   int i;
7343
7344   if (nargs < coding_arg_max)
7345     goto short_args;
7346
7347   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7348
7349   name = args[coding_arg_name];
7350   CHECK_SYMBOL (name);
7351   CODING_ATTR_BASE_NAME (attrs) = name;
7352
7353   val = args[coding_arg_mnemonic];
7354   if (! STRINGP (val))
7355     CHECK_CHARACTER (val);
7356   CODING_ATTR_MNEMONIC (attrs) = val;
7357
7358   coding_type = args[coding_arg_coding_type];
7359   CHECK_SYMBOL (coding_type);
7360   CODING_ATTR_TYPE (attrs) = coding_type;
7361
7362   charset_list = args[coding_arg_charset_list];
7363   if (SYMBOLP (charset_list))
7364     {
7365       if (EQ (charset_list, Qiso_2022))
7366         {
7367           if (! EQ (coding_type, Qiso_2022))
7368             error ("Invalid charset-list");
7369           charset_list = Viso_2022_charset_list;
7370         }
7371       else if (EQ (charset_list, Qemacs_mule))
7372         {
7373           if (! EQ (coding_type, Qemacs_mule))
7374             error ("Invalid charset-list");
7375           charset_list = Vemacs_mule_charset_list;
7376         }
7377       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7378         if (max_charset_id < XFASTINT (XCAR (tail)))
7379           max_charset_id = XFASTINT (XCAR (tail));
7380     }
7381   else
7382     {
7383       charset_list = Fcopy_sequence (charset_list);
7384       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7385         {
7386           struct charset *charset;
7387
7388           val = Fcar (tail);
7389           CHECK_CHARSET_GET_CHARSET (val, charset);
7390           if (EQ (coding_type, Qiso_2022)
7391               ? CHARSET_ISO_FINAL (charset) < 0
7392               : EQ (coding_type, Qemacs_mule)
7393               ? CHARSET_EMACS_MULE_ID (charset) < 0
7394               : 0)
7395             error ("Can't handle charset `%s'",
7396                    XSYMBOL (CHARSET_NAME (charset))->name->data);
7397
7398           XCAR (tail) = make_number (charset->id);
7399           if (max_charset_id < charset->id)
7400             max_charset_id = charset->id;
7401         }
7402     }
7403   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7404
7405   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7406                                 make_number (255));
7407   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7408     XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7409   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7410
7411   val = args[coding_arg_decode_translation_table];
7412   if (! NILP (val))
7413     CHECK_CHAR_TABLE (val);
7414   CODING_ATTR_DECODE_TBL (attrs) = val;
7415
7416   val = args[coding_arg_encode_translation_table];
7417   if (! NILP (val))
7418     CHECK_CHAR_TABLE (val);
7419   CODING_ATTR_ENCODE_TBL (attrs) = val;
7420
7421   val = args[coding_arg_post_read_conversion];
7422   CHECK_SYMBOL (val);
7423   CODING_ATTR_POST_READ (attrs) = val;
7424
7425   val = args[coding_arg_pre_write_conversion];
7426   CHECK_SYMBOL (val);
7427   CODING_ATTR_PRE_WRITE (attrs) = val;
7428
7429   val = args[coding_arg_default_char];
7430   if (NILP (val))
7431     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7432   else
7433     {
7434       CHECK_CHARACTER (val);
7435       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7436     }
7437
7438   val = args[coding_arg_plist];
7439   CHECK_LIST (val);
7440   CODING_ATTR_PLIST (attrs) = val;
7441
7442   if (EQ (coding_type, Qcharset))
7443     {
7444       /* Generate a lisp vector of 256 elements.  Each element is nil,
7445          integer, or a list of charset IDs.
7446
7447          If Nth element is nil, the byte code N is invalid in this
7448          coding system.
7449
7450          If Nth element is a number NUM, N is the first byte of a
7451          charset whose ID is NUM.
7452
7453          If Nth element is a list of charset IDs, N is the first byte
7454          of one of them.  The list is sorted by dimensions of the
7455          charsets.  A charset of smaller dimension comes firtst.
7456       */
7457       val = Fmake_vector (make_number (256), Qnil);
7458
7459       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7460         {
7461           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
7462           int dim = CHARSET_DIMENSION (charset);
7463           int idx = (dim - 1) * 4;
7464
7465           for (i = charset->code_space[idx];
7466                i <= charset->code_space[idx + 1]; i++)
7467             {
7468               Lisp_Object tmp, tmp2;
7469               int dim2;
7470
7471               tmp = AREF (val, i);
7472               if (NILP (tmp))
7473                 tmp = XCAR (tail);
7474               else if (NUMBERP (tmp))
7475                 {
7476                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
7477                   if (dim < dim2)
7478                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
7479                   else
7480                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
7481                 }
7482               else
7483                 {
7484                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
7485                     {
7486                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
7487                       if (dim < dim2)
7488                         break;
7489                     }
7490                   if (NILP (tmp2))
7491                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
7492                   else
7493                     {
7494                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
7495                       XSETCAR (tmp2, XCAR (tail));
7496                     }
7497                 }
7498               ASET (val, i, tmp);
7499             }
7500         }
7501       ASET (attrs, coding_attr_charset_valids, val);
7502       category = coding_category_charset;
7503     }
7504   else if (EQ (coding_type, Qccl))
7505     {
7506       Lisp_Object valids;
7507
7508       if (nargs < coding_arg_ccl_max)
7509         goto short_args;
7510
7511       val = args[coding_arg_ccl_decoder];
7512       CHECK_CCL_PROGRAM (val);
7513       if (VECTORP (val))
7514         val = Fcopy_sequence (val);
7515       ASET (attrs, coding_attr_ccl_decoder, val);
7516
7517       val = args[coding_arg_ccl_encoder];
7518       CHECK_CCL_PROGRAM (val);
7519       if (VECTORP (val))
7520         val = Fcopy_sequence (val);
7521       ASET (attrs, coding_attr_ccl_encoder, val);
7522
7523       val = args[coding_arg_ccl_valids];
7524       valids = Fmake_string (make_number (256), make_number (0));
7525       for (tail = val; !NILP (tail); tail = Fcdr (tail))
7526         {
7527           val = Fcar (tail);
7528           if (INTEGERP (val))
7529             ASET (valids, XINT (val), 1);
7530           else
7531             {
7532               int from, to;
7533
7534               CHECK_CONS (val);
7535               CHECK_NUMBER (XCAR (val));
7536               CHECK_NUMBER (XCDR (val));
7537               from = XINT (XCAR (val));
7538               to = XINT (XCDR (val));
7539               for (i = from; i <= to; i++)
7540                 ASET (valids, i, 1);
7541             }
7542         }
7543       ASET (attrs, coding_attr_ccl_valids, valids);
7544
7545       category = coding_category_ccl;
7546     }
7547   else if (EQ (coding_type, Qutf_16))
7548     {
7549       Lisp_Object bom, endian;
7550
7551       if (nargs < coding_arg_utf16_max)
7552         goto short_args;
7553
7554       bom = args[coding_arg_utf16_bom];
7555       if (! NILP (bom) && ! EQ (bom, Qt))
7556         {
7557           CHECK_CONS (bom);
7558           CHECK_CODING_SYSTEM (XCAR (bom));
7559           CHECK_CODING_SYSTEM (XCDR (bom));
7560         }
7561       ASET (attrs, coding_attr_utf_16_bom, bom);
7562
7563       endian = args[coding_arg_utf16_endian];
7564       ASET (attrs, coding_attr_utf_16_endian, endian);
7565
7566       category = (CONSP (bom)
7567                   ? coding_category_utf_16_auto
7568                   : NILP (bom)
7569                   ? (NILP (endian)
7570                      ? coding_category_utf_16_be_nosig
7571                      : coding_category_utf_16_le_nosig)
7572                   : (NILP (endian)
7573                      ? coding_category_utf_16_be
7574                      : coding_category_utf_16_le));
7575     }
7576   else if (EQ (coding_type, Qiso_2022))
7577     {
7578       Lisp_Object initial, reg_usage, request, flags;
7579       int i, id;
7580
7581       if (nargs < coding_arg_iso2022_max)
7582         goto short_args;
7583
7584       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
7585       CHECK_VECTOR (initial);
7586       for (i = 0; i < 4; i++)
7587         {
7588           val = Faref (initial, make_number (i));
7589           if (! NILP (val))
7590             {
7591               CHECK_CHARSET_GET_ID (val, id);
7592               ASET (initial, i, make_number (id));
7593             }
7594           else
7595             ASET (initial, i, make_number (-1));
7596         }
7597
7598       reg_usage = args[coding_arg_iso2022_reg_usage];
7599       CHECK_CONS (reg_usage);
7600       CHECK_NATNUM (XCAR (reg_usage));
7601       CHECK_NATNUM (XCDR (reg_usage));
7602
7603       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
7604       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
7605         {
7606           int id;
7607
7608           val = Fcar (tail);
7609           CHECK_CONS (val);
7610           CHECK_CHARSET_GET_ID (XCAR (val), id);
7611           CHECK_NATNUM (XCDR (val));
7612           if (XINT (XCDR (val)) >= 4)
7613             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
7614           XCAR (val) = make_number (id);
7615         }
7616
7617       flags = args[coding_arg_iso2022_flags];
7618       CHECK_NATNUM (flags);
7619       i = XINT (flags);
7620       if (EQ (args[coding_arg_charset_list], Qiso_2022))
7621         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
7622
7623       ASET (attrs, coding_attr_iso_initial, initial);
7624       ASET (attrs, coding_attr_iso_usage, reg_usage);
7625       ASET (attrs, coding_attr_iso_request, request);
7626       ASET (attrs, coding_attr_iso_flags, flags);
7627       setup_iso_safe_charsets (attrs);
7628
7629       if (i & CODING_ISO_FLAG_SEVEN_BITS)
7630         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7631                           | CODING_ISO_FLAG_SINGLE_SHIFT))
7632                     ? coding_category_iso_7_else
7633                     : EQ (args[coding_arg_charset_list], Qiso_2022)
7634                     ? coding_category_iso_7
7635                     : coding_category_iso_7_tight);
7636       else
7637         {
7638           int id = XINT (AREF (initial, 1));
7639
7640           category = (((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7641                              | CODING_ISO_FLAG_SINGLE_SHIFT))
7642                        || EQ (args[coding_arg_charset_list], Qiso_2022)
7643                        || id < 0)
7644                       ? coding_category_iso_8_else
7645                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
7646                       ? coding_category_iso_8_1
7647                       : coding_category_iso_8_2);
7648         }
7649     }
7650   else if (EQ (coding_type, Qemacs_mule))
7651     {
7652       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
7653         ASET (attrs, coding_attr_emacs_mule_full, Qt);
7654
7655       category = coding_category_emacs_mule;
7656     }
7657   else if (EQ (coding_type, Qshift_jis))
7658     {
7659
7660       struct charset *charset;
7661
7662       if (XINT (Flength (charset_list)) != 3)
7663         error ("There should be just three charsets");
7664
7665       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7666       if (CHARSET_DIMENSION (charset) != 1)
7667         error ("Dimension of charset %s is not one",
7668                XSYMBOL (CHARSET_NAME (charset))->name->data);
7669
7670       charset_list = XCDR (charset_list);
7671       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7672       if (CHARSET_DIMENSION (charset) != 1)
7673         error ("Dimension of charset %s is not one",
7674                XSYMBOL (CHARSET_NAME (charset))->name->data);
7675
7676       charset_list = XCDR (charset_list);
7677       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7678       if (CHARSET_DIMENSION (charset) != 2)
7679         error ("Dimension of charset %s is not two",
7680                XSYMBOL (CHARSET_NAME (charset))->name->data);
7681
7682       category = coding_category_sjis;
7683       Vsjis_coding_system = name;
7684     }
7685   else if (EQ (coding_type, Qbig5))
7686     {
7687       struct charset *charset;
7688
7689       if (XINT (Flength (charset_list)) != 2)
7690         error ("There should be just two charsets");
7691
7692       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7693       if (CHARSET_DIMENSION (charset) != 1)
7694         error ("Dimension of charset %s is not one",
7695                XSYMBOL (CHARSET_NAME (charset))->name->data);
7696
7697       charset_list = XCDR (charset_list);
7698       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7699       if (CHARSET_DIMENSION (charset) != 2)
7700         error ("Dimension of charset %s is not two",
7701                XSYMBOL (CHARSET_NAME (charset))->name->data);
7702
7703       category = coding_category_big5;
7704       Vbig5_coding_system = name;
7705     }
7706   else if (EQ (coding_type, Qraw_text))
7707     category = coding_category_raw_text;
7708   else if (EQ (coding_type, Qutf_8))
7709     category = coding_category_utf_8;
7710   else if (EQ (coding_type, Qundecided))
7711     category = coding_category_undecided;
7712   else
7713     error ("Invalid coding system type: %s",
7714            XSYMBOL (coding_type)->name->data);
7715
7716   CODING_ATTR_CATEGORY (attrs) = make_number (category);
7717
7718   eol_type = args[coding_arg_eol_type];
7719   if (! NILP (eol_type)
7720       && ! EQ (eol_type, Qunix)
7721       && ! EQ (eol_type, Qdos)
7722       && ! EQ (eol_type, Qmac))
7723     error ("Invalid eol-type");
7724
7725   aliases = Fcons (name, Qnil);
7726
7727   if (NILP (eol_type))
7728     {
7729       eol_type = make_subsidiaries (name);
7730       for (i = 0; i < 3; i++)
7731         {
7732           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
7733
7734           this_name = AREF (eol_type, i);
7735           this_aliases = Fcons (this_name, Qnil);
7736           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
7737           this_spec = Fmake_vector (make_number (3), attrs);
7738           ASET (this_spec, 1, this_aliases);
7739           ASET (this_spec, 2, this_eol_type);
7740           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
7741           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
7742           Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
7743                                         Vcoding_system_alist);
7744         }
7745     }
7746
7747   spec_vec = Fmake_vector (make_number (3), attrs);
7748   ASET (spec_vec, 1, aliases);
7749   ASET (spec_vec, 2, eol_type);
7750
7751   Fputhash (name, spec_vec, Vcoding_system_hash_table);
7752   Vcoding_system_list = Fcons (name, Vcoding_system_list);
7753   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
7754                                 Vcoding_system_alist);
7755
7756   {
7757     int id = coding_categories[category].id;
7758
7759     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
7760       setup_coding_system (name, &coding_categories[category]);
7761   }
7762
7763   return Qnil;
7764
7765  short_args:
7766   return Fsignal (Qwrong_number_of_arguments,
7767                   Fcons (intern ("define-coding-system-internal"),
7768                          make_number (nargs)));
7769 }
7770
7771 /* Fixme: should this record the alias relationships for
7772    diagnostics?  */
7773 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
7774        Sdefine_coding_system_alias, 2, 2, 0,
7775        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
7776      (alias, coding_system)
7777      Lisp_Object alias, coding_system;
7778 {
7779   Lisp_Object spec, aliases, eol_type;
7780
7781   CHECK_SYMBOL (alias);
7782   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7783   aliases = AREF (spec, 1);
7784   while (!NILP (XCDR (aliases)))
7785     aliases = XCDR (aliases);
7786   XCDR (aliases) = Fcons (alias, Qnil);
7787
7788   eol_type = AREF (spec, 2);
7789   if (VECTORP (eol_type))
7790     {
7791       Lisp_Object subsidiaries;
7792       int i;
7793
7794       subsidiaries = make_subsidiaries (alias);
7795       for (i = 0; i < 3; i++)
7796         Fdefine_coding_system_alias (AREF (subsidiaries, i),
7797                                      AREF (eol_type, i));
7798
7799       ASET (spec, 2, subsidiaries);
7800     }
7801
7802   Fputhash (alias, spec, Vcoding_system_hash_table);
7803   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
7804                                 Vcoding_system_alist);
7805
7806   return Qnil;
7807 }
7808
7809 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
7810        1, 1, 0,
7811        doc: /* Return the base of CODING-SYSTEM.
7812 Any alias or subsidiary coding system is not a base coding system.  */)
7813   (coding_system)
7814      Lisp_Object coding_system;
7815 {
7816   Lisp_Object spec, attrs;
7817
7818   if (NILP (coding_system))
7819     return (Qno_conversion);
7820   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7821   attrs = AREF (spec, 0);
7822   return CODING_ATTR_BASE_NAME (attrs);
7823 }
7824
7825 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
7826        1, 1, 0,
7827        doc: "Return the property list of CODING-SYSTEM.")
7828      (coding_system)
7829      Lisp_Object coding_system;
7830 {
7831   Lisp_Object spec, attrs;
7832
7833   if (NILP (coding_system))
7834     coding_system = Qno_conversion;
7835   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7836   attrs = AREF (spec, 0);
7837   return CODING_ATTR_PLIST (attrs);
7838 }
7839
7840
7841 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
7842        1, 1, 0,
7843        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
7844      (coding_system)
7845      Lisp_Object coding_system;
7846 {
7847   Lisp_Object spec;
7848
7849   if (NILP (coding_system))
7850     coding_system = Qno_conversion;
7851   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7852   return AREF (spec, 1);
7853 }
7854
7855 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
7856        Scoding_system_eol_type, 1, 1, 0,
7857        doc: /* Return eol-type of CODING-SYSTEM.
7858 An eol-type is integer 0, 1, 2, or a vector of coding systems.
7859
7860 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7861 and CR respectively.
7862
7863 A vector value indicates that a format of end-of-line should be
7864 detected automatically.  Nth element of the vector is the subsidiary
7865 coding system whose eol-type is N.  */)
7866      (coding_system)
7867      Lisp_Object coding_system;
7868 {
7869   Lisp_Object spec, eol_type;
7870   int n;
7871
7872   if (NILP (coding_system))
7873     coding_system = Qno_conversion;
7874   if (! CODING_SYSTEM_P (coding_system))
7875     return Qnil;
7876   spec = CODING_SYSTEM_SPEC (coding_system);
7877   eol_type = AREF (spec, 2);
7878   if (VECTORP (eol_type))
7879     return Fcopy_sequence (eol_type);
7880   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
7881   return make_number (n);
7882 }
7883
7884 #endif /* emacs */
7885
7886 \f
7887 /*** 9. Post-amble ***/
7888
7889 void
7890 init_coding_once ()
7891 {
7892   int i;
7893
7894   for (i = 0; i < coding_category_max; i++)
7895     {
7896       coding_categories[i].id = -1;
7897       coding_priorities[i] = i;
7898     }
7899
7900   /* ISO2022 specific initialize routine.  */
7901   for (i = 0; i < 0x20; i++)
7902     iso_code_class[i] = ISO_control_0;
7903   for (i = 0x21; i < 0x7F; i++)
7904     iso_code_class[i] = ISO_graphic_plane_0;
7905   for (i = 0x80; i < 0xA0; i++)
7906     iso_code_class[i] = ISO_control_1;
7907   for (i = 0xA1; i < 0xFF; i++)
7908     iso_code_class[i] = ISO_graphic_plane_1;
7909   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7910   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7911   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7912   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7913   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7914   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7915   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7916   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7917   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7918   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7919
7920   inhibit_pre_post_conversion = 0;
7921
7922   for (i = 0; i < 256; i++)
7923     {
7924       emacs_mule_bytes[i] = 1;
7925     }
7926   emacs_mule_bytes[LEADING_CODE_PRIVATE_11] = 3;
7927   emacs_mule_bytes[LEADING_CODE_PRIVATE_12] = 3;
7928   emacs_mule_bytes[LEADING_CODE_PRIVATE_21] = 4;
7929   emacs_mule_bytes[LEADING_CODE_PRIVATE_22] = 4;
7930 }
7931
7932 #ifdef emacs
7933
7934 void
7935 syms_of_coding ()
7936 {
7937   staticpro (&Vcoding_system_hash_table);
7938   Vcoding_system_hash_table = Fmakehash (Qeq);
7939
7940   staticpro (&Vsjis_coding_system);
7941   Vsjis_coding_system = Qnil;
7942
7943   staticpro (&Vbig5_coding_system);
7944   Vbig5_coding_system = Qnil;
7945
7946   staticpro (&Vcode_conversion_work_buf_list);
7947   Vcode_conversion_work_buf_list = Qnil;
7948
7949   staticpro (&Vcode_conversion_reused_work_buf);
7950   Vcode_conversion_reused_work_buf = Qnil;
7951
7952   DEFSYM (Qcharset, "charset");
7953   DEFSYM (Qtarget_idx, "target-idx");
7954   DEFSYM (Qcoding_system_history, "coding-system-history");
7955   Fset (Qcoding_system_history, Qnil);
7956
7957   /* Target FILENAME is the first argument.  */
7958   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7959   /* Target FILENAME is the third argument.  */
7960   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7961
7962   DEFSYM (Qcall_process, "call-process");
7963   /* Target PROGRAM is the first argument.  */
7964   Fput (Qcall_process, Qtarget_idx, make_number (0));
7965
7966   DEFSYM (Qcall_process_region, "call-process-region");
7967   /* Target PROGRAM is the third argument.  */
7968   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7969
7970   DEFSYM (Qstart_process, "start-process");
7971   /* Target PROGRAM is the third argument.  */
7972   Fput (Qstart_process, Qtarget_idx, make_number (2));
7973
7974   DEFSYM (Qopen_network_stream, "open-network-stream");
7975   /* Target SERVICE is the fourth argument.  */
7976   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7977
7978   DEFSYM (Qcoding_system, "coding-system");
7979   DEFSYM (Qcoding_aliases, "coding-aliases");
7980
7981   DEFSYM (Qeol_type, "eol-type");
7982   DEFSYM (Qunix, "unix");
7983   DEFSYM (Qdos, "dos");
7984
7985   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
7986   DEFSYM (Qpost_read_conversion, "post-read-conversion");
7987   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
7988   DEFSYM (Qdefault_char, "default-char");
7989   DEFSYM (Qundecided, "undecided");
7990   DEFSYM (Qno_conversion, "no-conversion");
7991   DEFSYM (Qraw_text, "raw-text");
7992
7993   DEFSYM (Qiso_2022, "iso-2022");
7994
7995   DEFSYM (Qutf_8, "utf-8");
7996
7997   DEFSYM (Qutf_16, "utf-16");
7998   DEFSYM (Qutf_16_be, "utf-16-be");
7999   DEFSYM (Qutf_16_be_nosig, "utf-16-be-nosig");
8000   DEFSYM (Qutf_16_le, "utf-16-l3");
8001   DEFSYM (Qutf_16_le_nosig, "utf-16-le-nosig");
8002   DEFSYM (Qsignature, "signature");
8003   DEFSYM (Qendian, "endian");
8004   DEFSYM (Qbig, "big");
8005   DEFSYM (Qlittle, "little");
8006
8007   DEFSYM (Qshift_jis, "shift-jis");
8008   DEFSYM (Qbig5, "big5");
8009
8010   DEFSYM (Qcoding_system_p, "coding-system-p");
8011
8012   DEFSYM (Qcoding_system_error, "coding-system-error");
8013   Fput (Qcoding_system_error, Qerror_conditions,
8014         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8015   Fput (Qcoding_system_error, Qerror_message,
8016         build_string ("Invalid coding system"));
8017
8018   /* Intern this now in case it isn't already done.
8019      Setting this variable twice is harmless.
8020      But don't staticpro it here--that is done in alloc.c.  */
8021   Qchar_table_extra_slots = intern ("char-table-extra-slots");
8022
8023   DEFSYM (Qtranslation_table, "translation-table");
8024   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8025   DEFSYM (Qtranslation_table_id, "translation-table-id");
8026   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8027   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8028
8029   DEFSYM (Qvalid_codes, "valid-codes");
8030
8031   DEFSYM (Qemacs_mule, "emacs-mule");
8032
8033   Vcoding_category_table
8034     = Fmake_vector (make_number (coding_category_max), Qnil);
8035   staticpro (&Vcoding_category_table);
8036   /* Followings are target of code detection.  */
8037   ASET (Vcoding_category_table, coding_category_iso_7,
8038         intern ("coding-category-iso-7"));
8039   ASET (Vcoding_category_table, coding_category_iso_7_tight,
8040         intern ("coding-category-iso-7-tight"));
8041   ASET (Vcoding_category_table, coding_category_iso_8_1,
8042         intern ("coding-category-iso-8-1"));
8043   ASET (Vcoding_category_table, coding_category_iso_8_2,
8044         intern ("coding-category-iso-8-2"));
8045   ASET (Vcoding_category_table, coding_category_iso_7_else,
8046         intern ("coding-category-iso-7-else"));
8047   ASET (Vcoding_category_table, coding_category_iso_8_else,
8048         intern ("coding-category-iso-8-else"));
8049   ASET (Vcoding_category_table, coding_category_utf_8,
8050         intern ("coding-category-utf-8"));
8051   ASET (Vcoding_category_table, coding_category_utf_16_be,
8052         intern ("coding-category-utf-16-be"));
8053   ASET (Vcoding_category_table, coding_category_utf_16_le,
8054         intern ("coding-category-utf-16-le"));
8055   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8056         intern ("coding-category-utf-16-be-nosig"));
8057   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8058         intern ("coding-category-utf-16-le-nosig"));
8059   ASET (Vcoding_category_table, coding_category_charset,
8060         intern ("coding-category-charset"));
8061   ASET (Vcoding_category_table, coding_category_sjis,
8062         intern ("coding-category-sjis"));
8063   ASET (Vcoding_category_table, coding_category_big5,
8064         intern ("coding-category-big5"));
8065   ASET (Vcoding_category_table, coding_category_ccl,
8066         intern ("coding-category-ccl"));
8067   ASET (Vcoding_category_table, coding_category_emacs_mule,
8068         intern ("coding-category-emacs-mule"));
8069   /* Followings are NOT target of code detection.  */
8070   ASET (Vcoding_category_table, coding_category_raw_text,
8071         intern ("coding-category-raw-text"));
8072   ASET (Vcoding_category_table, coding_category_undecided,
8073         intern ("coding-category-undecided"));
8074
8075   defsubr (&Scoding_system_p);
8076   defsubr (&Sread_coding_system);
8077   defsubr (&Sread_non_nil_coding_system);
8078   defsubr (&Scheck_coding_system);
8079   defsubr (&Sdetect_coding_region);
8080   defsubr (&Sdetect_coding_string);
8081   defsubr (&Sfind_coding_systems_region_internal);
8082   defsubr (&Scheck_coding_systems_region);
8083   defsubr (&Sdecode_coding_region);
8084   defsubr (&Sencode_coding_region);
8085   defsubr (&Sdecode_coding_string);
8086   defsubr (&Sencode_coding_string);
8087   defsubr (&Sdecode_sjis_char);
8088   defsubr (&Sencode_sjis_char);
8089   defsubr (&Sdecode_big5_char);
8090   defsubr (&Sencode_big5_char);
8091   defsubr (&Sset_terminal_coding_system_internal);
8092   defsubr (&Sset_safe_terminal_coding_system_internal);
8093   defsubr (&Sterminal_coding_system);
8094   defsubr (&Sset_keyboard_coding_system_internal);
8095   defsubr (&Skeyboard_coding_system);
8096   defsubr (&Sfind_operation_coding_system);
8097   defsubr (&Sset_coding_system_priority);
8098   defsubr (&Sdefine_coding_system_internal);
8099   defsubr (&Sdefine_coding_system_alias);
8100   defsubr (&Scoding_system_base);
8101   defsubr (&Scoding_system_plist);
8102   defsubr (&Scoding_system_aliases);
8103   defsubr (&Scoding_system_eol_type);
8104   defsubr (&Scoding_system_priority_list);
8105
8106   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8107                doc: /* List of coding systems.
8108
8109 Do not alter the value of this variable manually.  This variable should be
8110 updated by the functions `define-coding-system' and
8111 `define-coding-system-alias'.  */);
8112   Vcoding_system_list = Qnil;
8113
8114   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8115                doc: /* Alist of coding system names.
8116 Each element is one element list of coding system name.
8117 This variable is given to `completing-read' as TABLE argument.
8118
8119 Do not alter the value of this variable manually.  This variable should be
8120 updated by the functions `make-coding-system' and
8121 `define-coding-system-alias'.  */);
8122   Vcoding_system_alist = Qnil;
8123
8124   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8125                doc: /* List of coding-categories (symbols) ordered by priority.
8126
8127 On detecting a coding system, Emacs tries code detection algorithms
8128 associated with each coding-category one by one in this order.  When
8129 one algorithm agrees with a byte sequence of source text, the coding
8130 system bound to the corresponding coding-category is selected.  */);
8131   {
8132     int i;
8133
8134     Vcoding_category_list = Qnil;
8135     for (i = coding_category_max - 1; i >= 0; i--)
8136       Vcoding_category_list
8137         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8138                  Vcoding_category_list);
8139   }
8140
8141   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8142                doc: /* Specify the coding system for read operations.
8143 It is useful to bind this variable with `let', but do not set it globally.
8144 If the value is a coding system, it is used for decoding on read operation.
8145 If not, an appropriate element is used from one of the coding system alists:
8146 There are three such tables, `file-coding-system-alist',
8147 `process-coding-system-alist', and `network-coding-system-alist'.  */);
8148   Vcoding_system_for_read = Qnil;
8149
8150   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8151                doc: /* Specify the coding system for write operations.
8152 Programs bind this variable with `let', but you should not set it globally.
8153 If the value is a coding system, it is used for encoding of output,
8154 when writing it to a file and when sending it to a file or subprocess.
8155
8156 If this does not specify a coding system, an appropriate element
8157 is used from one of the coding system alists:
8158 There are three such tables, `file-coding-system-alist',
8159 `process-coding-system-alist', and `network-coding-system-alist'.
8160 For output to files, if the above procedure does not specify a coding system,
8161 the value of `buffer-file-coding-system' is used.  */);
8162   Vcoding_system_for_write = Qnil;
8163
8164   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8165                doc: /*
8166 Coding system used in the latest file or process I/O.  */);
8167   Vlast_coding_system_used = Qnil;
8168
8169   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8170                doc: /*
8171 *Non-nil means always inhibit code conversion of end-of-line format.
8172 See info node `Coding Systems' and info node `Text and Binary' concerning
8173 such conversion.  */);
8174   inhibit_eol_conversion = 0;
8175
8176   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8177                doc: /*
8178 Non-nil means process buffer inherits coding system of process output.
8179 Bind it to t if the process output is to be treated as if it were a file
8180 read from some filesystem.  */);
8181   inherit_process_coding_system = 0;
8182
8183   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8184                doc: /*
8185 Alist to decide a coding system to use for a file I/O operation.
8186 The format is ((PATTERN . VAL) ...),
8187 where PATTERN is a regular expression matching a file name,
8188 VAL is a coding system, a cons of coding systems, or a function symbol.
8189 If VAL is a coding system, it is used for both decoding and encoding
8190 the file contents.
8191 If VAL is a cons of coding systems, the car part is used for decoding,
8192 and the cdr part is used for encoding.
8193 If VAL is a function symbol, the function must return a coding system
8194 or a cons of coding systems which are used as above.  The function gets
8195 the arguments with which `find-operation-coding-systems' was called.
8196
8197 See also the function `find-operation-coding-system'
8198 and the variable `auto-coding-alist'.  */);
8199   Vfile_coding_system_alist = Qnil;
8200
8201   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8202                doc: /*
8203 Alist to decide a coding system to use for a process I/O operation.
8204 The format is ((PATTERN . VAL) ...),
8205 where PATTERN is a regular expression matching a program name,
8206 VAL is a coding system, a cons of coding systems, or a function symbol.
8207 If VAL is a coding system, it is used for both decoding what received
8208 from the program and encoding what sent to the program.
8209 If VAL is a cons of coding systems, the car part is used for decoding,
8210 and the cdr part is used for encoding.
8211 If VAL is a function symbol, the function must return a coding system
8212 or a cons of coding systems which are used as above.
8213
8214 See also the function `find-operation-coding-system'.  */);
8215   Vprocess_coding_system_alist = Qnil;
8216
8217   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8218                doc: /*
8219 Alist to decide a coding system to use for a network I/O operation.
8220 The format is ((PATTERN . VAL) ...),
8221 where PATTERN is a regular expression matching a network service name
8222 or is a port number to connect to,
8223 VAL is a coding system, a cons of coding systems, or a function symbol.
8224 If VAL is a coding system, it is used for both decoding what received
8225 from the network stream and encoding what sent to the network stream.
8226 If VAL is a cons of coding systems, the car part is used for decoding,
8227 and the cdr part is used for encoding.
8228 If VAL is a function symbol, the function must return a coding system
8229 or a cons of coding systems which are used as above.
8230
8231 See also the function `find-operation-coding-system'.  */);
8232   Vnetwork_coding_system_alist = Qnil;
8233
8234   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8235                doc: /* Coding system to use with system messages.
8236 Also used for decoding keyboard input on X Window system.  */);
8237   Vlocale_coding_system = Qnil;
8238
8239   /* The eol mnemonics are reset in startup.el system-dependently.  */
8240   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8241                doc: /*
8242 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8243   eol_mnemonic_unix = build_string (":");
8244
8245   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8246                doc: /*
8247 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8248   eol_mnemonic_dos = build_string ("\\");
8249
8250   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8251                doc: /*
8252 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8253   eol_mnemonic_mac = build_string ("/");
8254
8255   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8256                doc: /*
8257 *String displayed in mode line when end-of-line format is not yet determined.  */);
8258   eol_mnemonic_undecided = build_string (":");
8259
8260   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8261                doc: /*
8262 *Non-nil enables character translation while encoding and decoding.  */);
8263   Venable_character_translation = Qt;
8264
8265   DEFVAR_LISP ("standard-translation-table-for-decode",
8266                &Vstandard_translation_table_for_decode,
8267                doc: /* Table for translating characters while decoding.  */);
8268   Vstandard_translation_table_for_decode = Qnil;
8269
8270   DEFVAR_LISP ("standard-translation-table-for-encode",
8271                &Vstandard_translation_table_for_encode,
8272                doc: /* Table for translating characters while encoding.  */);
8273   Vstandard_translation_table_for_encode = Qnil;
8274
8275   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8276                doc: /* Alist of charsets vs revision numbers.
8277 While encoding, if a charset (car part of an element) is found,
8278 designate it with the escape sequence identifying revision (cdr part
8279 of the element).  */);
8280   Vcharset_revision_table = Qnil;
8281
8282   DEFVAR_LISP ("default-process-coding-system",
8283                &Vdefault_process_coding_system,
8284                doc: /* Cons of coding systems used for process I/O by default.
8285 The car part is used for decoding a process output,
8286 the cdr part is used for encoding a text to be sent to a process.  */);
8287   Vdefault_process_coding_system = Qnil;
8288
8289   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8290                doc: /*
8291 Table of extra Latin codes in the range 128..159 (inclusive).
8292 This is a vector of length 256.
8293 If Nth element is non-nil, the existence of code N in a file
8294 \(or output of subprocess) doesn't prevent it to be detected as
8295 a coding system of ISO 2022 variant which has a flag
8296 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8297 or reading output of a subprocess.
8298 Only 128th through 159th elements has a meaning.  */);
8299   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8300
8301   DEFVAR_LISP ("select-safe-coding-system-function",
8302                &Vselect_safe_coding_system_function,
8303                doc: /*
8304 Function to call to select safe coding system for encoding a text.
8305
8306 If set, this function is called to force a user to select a proper
8307 coding system which can encode the text in the case that a default
8308 coding system used in each operation can't encode the text.
8309
8310 The default value is `select-safe-coding-system' (which see).  */);
8311   Vselect_safe_coding_system_function = Qnil;
8312
8313   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8314                &inhibit_iso_escape_detection,
8315                doc: /*
8316 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8317
8318 By default, on reading a file, Emacs tries to detect how the text is
8319 encoded.  This code detection is sensitive to escape sequences.  If
8320 the sequence is valid as ISO2022, the code is determined as one of
8321 the ISO2022 encodings, and the file is decoded by the corresponding
8322 coding system (e.g. `iso-2022-7bit').
8323
8324 However, there may be a case that you want to read escape sequences in
8325 a file as is.  In such a case, you can set this variable to non-nil.
8326 Then, as the code detection ignores any escape sequences, no file is
8327 detected as encoded in some ISO2022 encoding.  The result is that all
8328 escape sequences become visible in a buffer.
8329
8330 The default value is nil, and it is strongly recommended not to change
8331 it.  That is because many Emacs Lisp source files that contain
8332 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8333 in Emacs's distribution, and they won't be decoded correctly on
8334 reading if you suppress escape sequence detection.
8335
8336 The other way to read escape sequences in a file without decoding is
8337 to explicitly specify some coding system that doesn't use ISO2022's
8338 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8339   inhibit_iso_escape_detection = 0;
8340
8341   {
8342     Lisp_Object args[coding_arg_max];
8343     Lisp_Object plist[14];
8344     int i;
8345
8346     for (i = 0; i < coding_arg_max; i++)
8347       args[i] = Qnil;
8348
8349     plist[0] = intern (":name");
8350     plist[1] = args[coding_arg_name] = Qno_conversion;
8351     plist[2] = intern (":mnemonic");
8352     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8353     plist[4] = intern (":coding-type");
8354     plist[5] = args[coding_arg_coding_type] = Qraw_text;
8355     plist[6] = intern (":ascii-compatible-p");
8356     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8357     plist[8] = intern (":default-char");
8358     plist[9] = args[coding_arg_default_char] = make_number (0);
8359     plist[10] = intern (":docstring");
8360     plist[11] = build_string ("Do no conversion.\n\
8361 \n\
8362 When you visit a file with this coding, the file is read into a\n\
8363 unibyte buffer as is, thus each byte of a file is treated as a\n\
8364 character.");
8365     plist[12] = intern (":eol-type");
8366     plist[13] = args[coding_arg_eol_type] = Qunix;
8367     args[coding_arg_plist] = Flist (14, plist);
8368     Fdefine_coding_system_internal (coding_arg_max, args);
8369   }
8370
8371   setup_coding_system (Qno_conversion, &keyboard_coding);
8372   setup_coding_system (Qno_conversion, &terminal_coding);
8373   setup_coding_system (Qno_conversion, &safe_terminal_coding);
8374 }
8375
8376 char *
8377 emacs_strerror (error_number)
8378      int error_number;
8379 {
8380   char *str;
8381
8382   synchronize_system_messages_locale ();
8383   str = strerror (error_number);
8384
8385   if (! NILP (Vlocale_coding_system))
8386     {
8387       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8388                                                       Vlocale_coding_system,
8389                                                       0);
8390       str = (char *) XSTRING (dec)->data;
8391     }
8392
8393   return str;
8394 }
8395
8396 #endif /* emacs */