src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001 Free Software Foundation, Inc.
   5    Copyright (C) 2001, 2002
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   Coding system is an object for a encoding mechanism that contains
  50   information about how to convert byte sequence to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from a
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the mechanism of encoding.  Here's a brief descrition about type.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by code converter defined for each
  75   character set.
  76
  77   o Old Emacs' internal format (emacs-mule)
  78
  79   The coding system adopted by an old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode a text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for a text containing raw eight-bit data.  Emacs
 112   treat each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How end-of-line of a text is encoded depends on a system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text characters encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g.  the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX.  Return 1 if the data contains
 148   a byte sequence which can be decoded into non-ASCII characters by
 149   the coding system.  Otherwize (i.e. the data contains only ASCII
 150   characters or invalid sequence) return 0.
 151
 152   It also resets some bits of an integer pointed by MASK.  The macros
 153   CATEGORY_MASK_XXX specifies each bit of this integer.
 154
 155   Below is the template of these functions.  */
 156
 157 #if 0
 158 static int
 159 detect_coding_XXX (coding, mask)
 160      struct coding_system *coding;
 161      int *mask;
 162 {
 163   unsigned char *src = coding->source;
 164   unsigned char *src_end = coding->source + coding->src_bytes;
 165   int multibytep = coding->src_multibyte;
 166   int c;
 167   int found = 0;
 168   ...;
 169
 170   while (1)
 171     {
 172       /* Get one byte from the source.  If the souce is exausted, jump
 173          to no_more_source:.  */
 174       ONE_MORE_BYTE (c);
 175       /* Check if it conforms to XXX.  If not, break the loop.  */
 176     }
 177   /* As the data is invalid for XXX, reset a proper bits.  */
 178   *mask &= ~CODING_CATEGORY_XXX;
 179   return 0;
 180  no_more_source:
 181   /* The source exausted.  */
 182   if (!found)
 183     /* ASCII characters only. */
 184     return 0;
 185   /* Some data should be decoded into non-ASCII characters.  */
 186   *mask &= CODING_CATEGORY_XXX;
 187   return 1;
 188 }
 189 #endif
 190
 191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 192
 193   These functions decode a byte sequence specified as a source by
 194   CODING.  The resulting multibyte text goes to a place pointed to by
 195   CODING->charbuf, the length of which should not exceed
 196   CODING->charbuf_size;
 197
 198   These functions set the information of original and decoded texts in
 199   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 200   They also set CODING->result to one of CODING_RESULT_XXX indicating
 201   how the decoding is finished.
 202
 203   Below is the template of these functions.  */
 204
 205 #if 0
 206 static void
 207 decode_coding_XXXX (coding)
 208      struct coding_system *coding;
 209 {
 210   unsigned char *src = coding->source + coding->consumed;
 211   unsigned char *src_end = coding->source + coding->src_bytes;
 212   /* SRC_BASE remembers the start position in source in each loop.
 213      The loop will be exited when there's not enough source code, or
 214      when there's no room in CHARBUF for a decoded character.  */
 215   unsigned char *src_base;
 216   /* A buffer to produce decoded characters.  */
 217   int *charbuf = coding->charbuf;
 218   int *charbuf_end = charbuf + coding->charbuf_size;
 219   int multibytep = coding->src_multibyte;
 220
 221   while (1)
 222     {
 223       src_base = src;
 224       if (charbuf < charbuf_end)
 225         /* No more room to produce a decoded character.  */
 226         break;
 227       ONE_MORE_BYTE (c);
 228       /* Decode it. */
 229     }
 230
 231  no_more_source:
 232   if (src_base < src_end
 233       && coding->mode & CODING_MODE_LAST_BLOCK)
 234     /* If the source ends by partial bytes to construct a character,
 235        treat them as eight-bit raw data.  */
 236     while (src_base < src_end && charbuf < charbuf_end)
 237       *charbuf++ = *src_base++;
 238   /* Remember how many bytes and characters we consumed.  If the
 239      source is multibyte, the bytes and chars are not identical.  */
 240   coding->consumed = coding->consumed_char = src_base - coding->source;
 241   /* Remember how many characters we produced.  */
 242   coding->charbuf_used = charbuf - coding->charbuf;
 243 }
 244 #endif
 245
 246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 247
 248   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 249   internal multibyte format by CODING.  The resulting byte sequence
 250   goes to a place pointed to by DESTINATION, the length of which
 251   should not exceed DST_BYTES.
 252
 253   These functions set the information of original and encoded texts in
 254   the members produced, produced_char, consumed, and consumed_char of
 255   the structure *CODING.  They also set the member result to one of
 256   CODING_RESULT_XXX indicating how the encoding finished.
 257
 258   DST_BYTES zero means that source area and destination area are
 259   overlapped, which means that we can produce a encoded text until it
 260   reaches at the head of not-yet-encoded source text.
 261
 262   Below is a template of these functions.  */
 263 #if 0
 264 static void
 265 encode_coding_XXX (coding)
 266      struct coding_system *coding;
 267 {
 268   int multibytep = coding->dst_multibyte;
 269   int *charbuf = coding->charbuf;
 270   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 271   unsigned char *dst = coding->destination + coding->produced;
 272   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 273   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 274   int produced_chars = 0;
 275
 276   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 277     {
 278       int c = *charbuf;
 279       /* Encode C into DST, and increment DST.  */
 280     }
 281  label_no_more_destination:
 282   /* How many chars and bytes we produced.  */
 283   coding->produced_char += produced_chars;
 284   coding->produced = dst - coding->destination;
 285 }
 286 #endif
 287
 288 \f
 289 /*** 1. Preamble ***/
 290
 291 #include <config.h>
 292 #include <stdio.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302
 303 Lisp_Object Vcoding_system_hash_table;
 304
 305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 306 Lisp_Object Qunix, Qdos, Qmac;
 307 Lisp_Object Qbuffer_file_coding_system;
 308 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 309 Lisp_Object Qdefault_char;
 310 Lisp_Object Qno_conversion, Qundecided;
 311 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 312 Lisp_Object Qutf_16_be_nosig, Qutf_16_be, Qutf_16_le_nosig, Qutf_16_le;
 313 Lisp_Object Qsignature, Qendian, Qbig, Qlittle;
 314 Lisp_Object Qcoding_system_history;
 315 Lisp_Object Qvalid_codes;
 316
 317 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 318 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 319 Lisp_Object Qstart_process, Qopen_network_stream;
 320 Lisp_Object Qtarget_idx;
 321
 322 Lisp_Object Vselect_safe_coding_system_function;
 323
 324 /* Mnemonic string for each format of end-of-line.  */
 325 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 326 /* Mnemonic string to indicate format of end-of-line is not yet
 327    decided.  */
 328 Lisp_Object eol_mnemonic_undecided;
 329
 330 #ifdef emacs
 331
 332 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 333
 334 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 335
 336 /* Coding system emacs-mule and raw-text are for converting only
 337    end-of-line format.  */
 338 Lisp_Object Qemacs_mule, Qraw_text;
 339
 340 /* Coding-systems are handed between Emacs Lisp programs and C internal
 341    routines by the following three variables.  */
 342 /* Coding-system for reading files and receiving data from process.  */
 343 Lisp_Object Vcoding_system_for_read;
 344 /* Coding-system for writing files and sending data to process.  */
 345 Lisp_Object Vcoding_system_for_write;
 346 /* Coding-system actually used in the latest I/O.  */
 347 Lisp_Object Vlast_coding_system_used;
 348
 349 /* A vector of length 256 which contains information about special
 350    Latin codes (especially for dealing with Microsoft codes).  */
 351 Lisp_Object Vlatin_extra_code_table;
 352
 353 /* Flag to inhibit code conversion of end-of-line format.  */
 354 int inhibit_eol_conversion;
 355
 356 /* Flag to inhibit ISO2022 escape sequence detection.  */
 357 int inhibit_iso_escape_detection;
 358
 359 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 360 int inherit_process_coding_system;
 361
 362 /* Coding system to be used to encode text for terminal display.  */
 363 struct coding_system terminal_coding;
 364
 365 /* Coding system to be used to encode text for terminal display when
 366    terminal coding system is nil.  */
 367 struct coding_system safe_terminal_coding;
 368
 369 /* Coding system of what is sent from terminal keyboard.  */
 370 struct coding_system keyboard_coding;
 371
 372 Lisp_Object Vfile_coding_system_alist;
 373 Lisp_Object Vprocess_coding_system_alist;
 374 Lisp_Object Vnetwork_coding_system_alist;
 375
 376 Lisp_Object Vlocale_coding_system;
 377
 378 #endif /* emacs */
 379
 380 /* Flag to tell if we look up translation table on character code
 381    conversion.  */
 382 Lisp_Object Venable_character_translation;
 383 /* Standard translation table to look up on decoding (reading).  */
 384 Lisp_Object Vstandard_translation_table_for_decode;
 385 /* Standard translation table to look up on encoding (writing).  */
 386 Lisp_Object Vstandard_translation_table_for_encode;
 387
 388 Lisp_Object Qtranslation_table;
 389 Lisp_Object Qtranslation_table_id;
 390 Lisp_Object Qtranslation_table_for_decode;
 391 Lisp_Object Qtranslation_table_for_encode;
 392
 393 /* Alist of charsets vs revision number.  */
 394 static Lisp_Object Vcharset_revision_table;
 395
 396 /* Default coding systems used for process I/O.  */
 397 Lisp_Object Vdefault_process_coding_system;
 398
 399 /* Global flag to tell that we can't call post-read-conversion and
 400    pre-write-conversion functions.  Usually the value is zero, but it
 401    is set to 1 temporarily while such functions are running.  This is
 402    to avoid infinite recursive call.  */
 403 static int inhibit_pre_post_conversion;
 404
 405 /* Char-table containing safe coding systems of each character.  */
 406 Lisp_Object Vchar_coding_system_table;
 407 Lisp_Object Qchar_coding_system;
 408
 409 /* Two special coding systems.  */
 410 Lisp_Object Vsjis_coding_system;
 411 Lisp_Object Vbig5_coding_system;
 412
 413
 414 static int detect_coding_utf_8 P_ ((struct coding_system *, int *));
 415 static void decode_coding_utf_8 P_ ((struct coding_system *));
 416 static int encode_coding_utf_8 P_ ((struct coding_system *));
 417
 418 static int detect_coding_utf_16 P_ ((struct coding_system *, int *));
 419 static void decode_coding_utf_16 P_ ((struct coding_system *));
 420 static int encode_coding_utf_16 P_ ((struct coding_system *));
 421
 422 static int detect_coding_iso_2022 P_ ((struct coding_system *, int *));
 423 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 424 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 425
 426 static int detect_coding_emacs_mule P_ ((struct coding_system *, int *));
 427 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 428 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 429
 430 static int detect_coding_sjis P_ ((struct coding_system *, int *));
 431 static void decode_coding_sjis P_ ((struct coding_system *));
 432 static int encode_coding_sjis P_ ((struct coding_system *));
 433
 434 static int detect_coding_big5 P_ ((struct coding_system *, int *));
 435 static void decode_coding_big5 P_ ((struct coding_system *));
 436 static int encode_coding_big5 P_ ((struct coding_system *));
 437
 438 static int detect_coding_ccl P_ ((struct coding_system *, int *));
 439 static void decode_coding_ccl P_ ((struct coding_system *));
 440 static int encode_coding_ccl P_ ((struct coding_system *));
 441
 442 static void decode_coding_raw_text P_ ((struct coding_system *));
 443 static int encode_coding_raw_text P_ ((struct coding_system *));
 444
 445
 446 /* ISO2022 section */
 447
 448 #define CODING_ISO_INITIAL(coding, reg)                 \
 449   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 450                      coding_attr_iso_initial),          \
 451                reg)))
 452
 453
 454 #define CODING_ISO_REQUEST(coding, charset_id)  \
 455   ((charset_id <= (coding)->max_charset_id      \
 456     ? (coding)->safe_charsets[charset_id]       \
 457     : -1))
 458
 459
 460 #define CODING_ISO_FLAGS(coding)        \
 461   ((coding)->spec.iso_2022.flags)
 462 #define CODING_ISO_DESIGNATION(coding, reg)     \
 463   ((coding)->spec.iso_2022.current_designation[reg])
 464 #define CODING_ISO_INVOCATION(coding, plane)    \
 465   ((coding)->spec.iso_2022.current_invocation[plane])
 466 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 467   ((coding)->spec.iso_2022.single_shifting)
 468 #define CODING_ISO_BOL(coding)  \
 469   ((coding)->spec.iso_2022.bol)
 470 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 471   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 472
 473 /* Control characters of ISO2022.  */
 474                         /* code */      /* function */
 475 #define ISO_CODE_LF     0x0A            /* line-feed */
 476 #define ISO_CODE_CR     0x0D            /* carriage-return */
 477 #define ISO_CODE_SO     0x0E            /* shift-out */
 478 #define ISO_CODE_SI     0x0F            /* shift-in */
 479 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 480 #define ISO_CODE_ESC    0x1B            /* escape */
 481 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 482 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 483 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 484
 485 /* All code (1-byte) of ISO2022 is classified into one of the
 486    followings.  */
 487 enum iso_code_class_type
 488   {
 489     ISO_control_0,              /* Control codes in the range
 490                                    0x00..0x1F and 0x7F, except for the
 491                                    following 5 codes.  */
 492     ISO_carriage_return,        /* ISO_CODE_CR (0x0D) */
 493     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 494     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 495     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 496     ISO_escape,                 /* ISO_CODE_SO (0x1B) */
 497     ISO_control_1,              /* Control codes in the range
 498                                    0x80..0x9F, except for the
 499                                    following 3 codes.  */
 500     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 501     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 502     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 503     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 504     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 505     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 506     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 507   };
 508
 509 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 510     `iso-flags' attribute of an iso2022 coding system.  */
 511
 512 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 513    instead of the correct short-form sequence (e.g. ESC $ A).  */
 514 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 515
 516 /* If set, reset graphic planes and registers at end-of-line to the
 517    initial state.  */
 518 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 519
 520 /* If set, reset graphic planes and registers before any control
 521    characters to the initial state.  */
 522 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 523
 524 /* If set, encode by 7-bit environment.  */
 525 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 526
 527 /* If set, use locking-shift function.  */
 528 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 529
 530 /* If set, use single-shift function.  Overwrite
 531    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 532 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 533
 534 /* If set, use designation escape sequence.  */
 535 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 536
 537 /* If set, produce revision number sequence.  */
 538 #define CODING_ISO_FLAG_REVISION        0x0080
 539
 540 /* If set, produce ISO6429's direction specifying sequence.  */
 541 #define CODING_ISO_FLAG_DIRECTION       0x0100
 542
 543 /* If set, assume designation states are reset at beginning of line on
 544    output.  */
 545 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 546
 547 /* If set, designation sequence should be placed at beginning of line
 548    on output.  */
 549 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 550
 551 /* If set, do not encode unsafe charactes on output.  */
 552 #define CODING_ISO_FLAG_SAFE            0x0800
 553
 554 /* If set, extra latin codes (128..159) are accepted as a valid code
 555    on input.  */
 556 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 557
 558 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 559
 560 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 561
 562 #define CODING_ISO_FLAG_FULL_SUPPORT    0x8000
 563
 564 /* A character to be produced on output if encoding of the original
 565    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 566 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 567
 568
 569 /* UTF-16 section */
 570 #define CODING_UTF_16_BOM(coding)       \
 571   ((coding)->spec.utf_16.bom)
 572
 573 #define CODING_UTF_16_ENDIAN(coding)    \
 574   ((coding)->spec.utf_16.endian)
 575
 576 #define CODING_UTF_16_SURROGATE(coding) \
 577   ((coding)->spec.utf_16.surrogate)
 578
 579
 580 /* CCL section */
 581 #define CODING_CCL_DECODER(coding)      \
 582   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 583 #define CODING_CCL_ENCODER(coding)      \
 584   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 585 #define CODING_CCL_VALIDS(coding)                                          \
 586   (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
 587    ->data)
 588
 589 /* Index for each coding category in `coding_category_table' */
 590
 591 enum coding_category
 592   {
 593     coding_category_iso_7,
 594     coding_category_iso_7_tight,
 595     coding_category_iso_8_1,
 596     coding_category_iso_8_2,
 597     coding_category_iso_7_else,
 598     coding_category_iso_8_else,
 599     coding_category_utf_8,
 600     coding_category_utf_16_auto,
 601     coding_category_utf_16_be,
 602     coding_category_utf_16_le,
 603     coding_category_utf_16_be_nosig,
 604     coding_category_utf_16_le_nosig,
 605     coding_category_charset,
 606     coding_category_sjis,
 607     coding_category_big5,
 608     coding_category_ccl,
 609     coding_category_emacs_mule,
 610     /* All above are targets of code detection.  */
 611     coding_category_raw_text,
 612     coding_category_undecided,
 613     coding_category_max
 614   };
 615
 616 /* Definitions of flag bits used in detect_coding_XXXX.  */
 617 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 618 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 619 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 620 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 621 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 622 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 623 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 624 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 625 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 626 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 627 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 628 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 629 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 630 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 631 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 632 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 633
 634 /* This value is returned if detect_coding_mask () find nothing other
 635    than ASCII characters.  */
 636 #define CATEGORY_MASK_ANY               \
 637   (CATEGORY_MASK_ISO_7                  \
 638    | CATEGORY_MASK_ISO_7_TIGHT          \
 639    | CATEGORY_MASK_ISO_8_1              \
 640    | CATEGORY_MASK_ISO_8_2              \
 641    | CATEGORY_MASK_ISO_7_ELSE           \
 642    | CATEGORY_MASK_ISO_8_ELSE           \
 643    | CATEGORY_MASK_UTF_8                \
 644    | CATEGORY_MASK_UTF_16_BE            \
 645    | CATEGORY_MASK_UTF_16_LE            \
 646    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 647    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 648    | CATEGORY_MASK_CHARSET              \
 649    | CATEGORY_MASK_SJIS                 \
 650    | CATEGORY_MASK_BIG5                 \
 651    | CATEGORY_MASK_CCL                  \
 652    | CATEGORY_MASK_EMACS_MULE)
 653
 654
 655 #define CATEGORY_MASK_ISO_7BIT \
 656   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 657
 658 #define CATEGORY_MASK_ISO_8BIT \
 659   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 660
 661 #define CATEGORY_MASK_ISO_ELSE \
 662   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 663
 664 #define CATEGORY_MASK_ISO_ESCAPE        \
 665   (CATEGORY_MASK_ISO_7                  \
 666    | CATEGORY_MASK_ISO_7_TIGHT          \
 667    | CATEGORY_MASK_ISO_7_ELSE           \
 668    | CATEGORY_MASK_ISO_8_ELSE)
 669
 670 #define CATEGORY_MASK_ISO       \
 671   (  CATEGORY_MASK_ISO_7BIT     \
 672      | CATEGORY_MASK_ISO_8BIT   \
 673      | CATEGORY_MASK_ISO_ELSE)
 674
 675 #define CATEGORY_MASK_UTF_16            \
 676   (CATEGORY_MASK_UTF_16_BE              \
 677    | CATEGORY_MASK_UTF_16_LE            \
 678    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 679    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 680
 681
 682 /* List of symbols `coding-category-xxx' ordered by priority.  This
 683    variable is exposed to Emacs Lisp.  */
 684 static Lisp_Object Vcoding_category_list;
 685
 686 /* Table of coding categories (Lisp symbols).  This variable is for
 687    internal use oly.  */
 688 static Lisp_Object Vcoding_category_table;
 689
 690 /* Table of coding-categories ordered by priority.  */
 691 static enum coding_category coding_priorities[coding_category_max];
 692
 693 /* Nth element is a coding context for the coding system bound to the
 694    Nth coding category.  */
 695 static struct coding_system coding_categories[coding_category_max];
 696
 697 static int detected_mask[coding_category_raw_text] =
 698   { CATEGORY_MASK_ISO,
 699     CATEGORY_MASK_ISO,
 700     CATEGORY_MASK_ISO,
 701     CATEGORY_MASK_ISO,
 702     CATEGORY_MASK_ISO,
 703     CATEGORY_MASK_ISO,
 704     CATEGORY_MASK_UTF_8,
 705     CATEGORY_MASK_UTF_16,
 706     CATEGORY_MASK_UTF_16,
 707     CATEGORY_MASK_UTF_16,
 708     CATEGORY_MASK_UTF_16,
 709     CATEGORY_MASK_UTF_16,
 710     CATEGORY_MASK_CHARSET,
 711     CATEGORY_MASK_SJIS,
 712     CATEGORY_MASK_BIG5,
 713     CATEGORY_MASK_CCL,
 714     CATEGORY_MASK_EMACS_MULE
 715   };
 716
 717 /*** Commonly used macros and functions ***/
 718
 719 #ifndef min
 720 #define min(a, b) ((a) < (b) ? (a) : (b))
 721 #endif
 722 #ifndef max
 723 #define max(a, b) ((a) > (b) ? (a) : (b))
 724 #endif
 725
 726 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list)  \
 727   do {                                                          \
 728     attrs = CODING_ID_ATTRS (coding->id);                       \
 729     eol_type = CODING_ID_EOL_TYPE (coding->id);                 \
 730     if (VECTORP (eol_type))                                     \
 731       eol_type = Qunix;                                         \
 732     charset_list = CODING_ATTR_CHARSET_LIST (attrs);            \
 733   } while (0)
 734
 735
 736 /* Safely get one byte from the source text pointed by SRC which ends
 737    at SRC_END, and set C to that byte.  If there are not enough bytes
 738    in the source, it jumps to `no_more_source'.  The caller
 739    should declare and set these variables appropriately in advance:
 740         src, src_end, multibytep
 741 */
 742
 743 #define ONE_MORE_BYTE(c)                                        \
 744   do {                                                          \
 745     if (src == src_end)                                         \
 746       {                                                         \
 747         if (src_base < src)                                     \
 748           coding->result = CODING_RESULT_INSUFFICIENT_SRC;      \
 749         goto no_more_source;                                    \
 750       }                                                         \
 751     c = *src++;                                                 \
 752     if (multibytep && (c & 0x80))                               \
 753       {                                                         \
 754         if ((c & 0xFE) != 0xC0)                                 \
 755           error ("Undecodable char found");                     \
 756         c = ((c & 1) << 6) | *src++;                            \
 757       }                                                         \
 758     consumed_chars++;                                           \
 759   } while (0)
 760
 761
 762 #define ONE_MORE_BYTE_NO_CHECK(c)               \
 763   do {                                          \
 764     c = *src++;                                 \
 765     if (multibytep && (c & 0x80))               \
 766       {                                         \
 767         if ((c & 0xFE) != 0xC0)                 \
 768           error ("Undecodable char found");     \
 769         c = ((c & 1) << 6) | *src++;            \
 770       }                                         \
 771   } while (0)
 772
 773
 774 /* Store a byte C in the place pointed by DST and increment DST to the
 775    next free point, and increment PRODUCED_CHARS.  The caller should
 776    assure that C is 0..127, and declare and set the variable `dst'
 777    appropriately in advance.
 778 */
 779
 780
 781 #define EMIT_ONE_ASCII_BYTE(c)  \
 782   do {                          \
 783     produced_chars++;           \
 784     *dst++ = (c);               \
 785   } while (0)
 786
 787
 788 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 789
 790 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 791   do {                                  \
 792     produced_chars += 2;                \
 793     *dst++ = (c1), *dst++ = (c2);       \
 794   } while (0)
 795
 796
 797 /* Store a byte C in the place pointed by DST and increment DST to the
 798    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 799    nonzero, store in an appropriate multibyte from.  The caller should
 800    declare and set the variables `dst' and `multibytep' appropriately
 801    in advance.  */
 802
 803 #define EMIT_ONE_BYTE(c)                \
 804   do {                                  \
 805     produced_chars++;                   \
 806     if (multibytep)                     \
 807       {                                 \
 808         int ch = (c);                   \
 809         if (ch >= 0x80)                 \
 810           ch = BYTE8_TO_CHAR (ch);      \
 811         CHAR_STRING_ADVANCE (ch, dst);  \
 812       }                                 \
 813     else                                \
 814       *dst++ = (c);                     \
 815   } while (0)
 816
 817
 818 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 819
 820 #define EMIT_TWO_BYTES(c1, c2)          \
 821   do {                                  \
 822     produced_chars += 2;                \
 823     if (multibytep)                     \
 824       {                                 \
 825         int ch;                         \
 826                                         \
 827         ch = (c1);                      \
 828         if (ch >= 0x80)                 \
 829           ch = BYTE8_TO_CHAR (ch);      \
 830         CHAR_STRING_ADVANCE (ch, dst);  \
 831         ch = (c2);                      \
 832         if (ch >= 0x80)                 \
 833           ch = BYTE8_TO_CHAR (ch);      \
 834         CHAR_STRING_ADVANCE (ch, dst);  \
 835       }                                 \
 836     else                                \
 837       {                                 \
 838         *dst++ = (c1);                  \
 839         *dst++ = (c2);                  \
 840       }                                 \
 841   } while (0)
 842
 843
 844 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 845   do {                                  \
 846     EMIT_ONE_BYTE (c1);                 \
 847     EMIT_TWO_BYTES (c2, c3);            \
 848   } while (0)
 849
 850
 851 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 852   do {                                          \
 853     EMIT_TWO_BYTES (c1, c2);                    \
 854     EMIT_TWO_BYTES (c3, c4);                    \
 855   } while (0)
 856
 857
 858 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 859   do {                                                                       \
 860     charset_map_loaded = 0;                                                  \
 861     c = DECODE_CHAR (charset, code);                                         \
 862     if (charset_map_loaded)                                                  \
 863       {                                                                      \
 864         unsigned char *orig = coding->source;                                \
 865         EMACS_INT offset;                                                    \
 866                                                                              \
 867         coding_set_source (coding);                                          \
 868         offset = coding->source - orig;                                      \
 869         src += offset;                                                       \
 870         src_base += offset;                                                  \
 871         src_end += offset;                                                   \
 872       }                                                                      \
 873   } while (0)
 874
 875
 876 #define ASSURE_DESTINATION(bytes)                               \
 877   do {                                                          \
 878     if (dst + (bytes) >= dst_end)                               \
 879       {                                                         \
 880         int more_bytes = charbuf_end - charbuf + (bytes);       \
 881                                                                 \
 882         dst = alloc_destination (coding, more_bytes, dst);      \
 883         dst_end = coding->destination + coding->dst_bytes;      \
 884       }                                                         \
 885   } while (0)
 886
 887
 888
 889 static void
 890 coding_set_source (coding)
 891      struct coding_system *coding;
 892 {
 893   if (BUFFERP (coding->src_object))
 894     {
 895       if (coding->src_pos < 0)
 896         coding->source = GAP_END_ADDR + coding->src_pos_byte;
 897       else
 898         {
 899           struct buffer *buf = XBUFFER (coding->src_object);
 900           EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
 901           unsigned char *beg_addr = BUF_BEG_ADDR (buf);
 902
 903           coding->source = beg_addr + coding->src_pos_byte - 1;
 904           if (coding->src_pos_byte >= gpt_byte)
 905             coding->source += BUF_GAP_SIZE (buf);
 906         }
 907     }
 908   else if (STRINGP (coding->src_object))
 909     {
 910       coding->source = (XSTRING (coding->src_object)->data
 911                         + coding->src_pos_byte);
 912     }
 913   else
 914     /* Otherwise, the source is C string and is never relocated
 915        automatically.  Thus we don't have to update anything.  */
 916     ;
 917 }
 918
 919 static void
 920 coding_set_destination (coding)
 921      struct coding_system *coding;
 922 {
 923   if (BUFFERP (coding->dst_object))
 924     {
 925       /* We are sure that coding->dst_pos_byte is before the gap of the
 926          buffer. */
 927       coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 928                              + coding->dst_pos_byte - 1);
 929       if (coding->src_pos < 0)
 930         coding->dst_bytes = (GAP_END_ADDR
 931                              - (coding->src_bytes - coding->consumed)
 932                              - coding->destination);
 933       else
 934         coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 935                              - coding->destination);
 936     }
 937   else
 938     /* Otherwise, the destination is C string and is never relocated
 939        automatically.  Thus we don't have to update anything.  */
 940     ;
 941 }
 942
 943
 944 static void
 945 coding_alloc_by_realloc (coding, bytes)
 946      struct coding_system *coding;
 947      EMACS_INT bytes;
 948 {
 949   coding->destination = (unsigned char *) xrealloc (coding->destination,
 950                                                     coding->dst_bytes + bytes);
 951   coding->dst_bytes += bytes;
 952 }
 953
 954 static void
 955 coding_alloc_by_making_gap (coding, bytes)
 956      struct coding_system *coding;
 957      EMACS_INT bytes;
 958 {
 959   if (BUFFERP (coding->dst_object)
 960       && EQ (coding->src_object, coding->dst_object))
 961     {
 962       EMACS_INT add = coding->src_bytes - coding->consumed;
 963
 964       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
 965       make_gap (bytes);
 966       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
 967     }
 968   else
 969     {
 970       Lisp_Object this_buffer;
 971
 972       this_buffer = Fcurrent_buffer ();
 973       set_buffer_internal (XBUFFER (coding->dst_object));
 974       make_gap (bytes);
 975       set_buffer_internal (XBUFFER (this_buffer));
 976     }
 977 }
 978
 979
 980 static unsigned char *
 981 alloc_destination (coding, nbytes, dst)
 982      struct coding_system *coding;
 983      int nbytes;
 984      unsigned char *dst;
 985 {
 986   EMACS_INT offset = dst - coding->destination;
 987
 988   if (BUFFERP (coding->dst_object))
 989     coding_alloc_by_making_gap (coding, nbytes);
 990   else
 991     coding_alloc_by_realloc (coding, nbytes);
 992   coding->result = CODING_RESULT_SUCCESS;
 993   coding_set_destination (coding);
 994   dst = coding->destination + offset;
 995   return dst;
 996 }
 997
 998 \f
 999 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1000
1001
1002
1003 \f
1004 /*** 3. UTF-8 ***/
1005
1006 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1007    Check if a text is encoded in UTF-8.  If it is, return
1008    CATEGORY_MASK_UTF_8, else return 0.  */
1009
1010 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1011 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1012 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1013 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1014 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1015 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1016
1017 static int
1018 detect_coding_utf_8 (coding, mask)
1019      struct coding_system *coding;
1020      int *mask;
1021 {
1022   unsigned char *src = coding->source, *src_base = src;
1023   unsigned char *src_end = coding->source + coding->src_bytes;
1024   int multibytep = coding->src_multibyte;
1025   int consumed_chars = 0;
1026   int found = 0;
1027
1028   /* A coding system of this category is always ASCII compatible.  */
1029   src += coding->head_ascii;
1030
1031   while (1)
1032     {
1033       int c, c1, c2, c3, c4;
1034
1035       ONE_MORE_BYTE (c);
1036       if (UTF_8_1_OCTET_P (c))
1037         continue;
1038       ONE_MORE_BYTE (c1);
1039       if (! UTF_8_EXTRA_OCTET_P (c1))
1040         break;
1041       if (UTF_8_2_OCTET_LEADING_P (c))
1042         {
1043           found++;
1044           continue;
1045         }
1046       ONE_MORE_BYTE (c2);
1047       if (! UTF_8_EXTRA_OCTET_P (c2))
1048         break;
1049       if (UTF_8_3_OCTET_LEADING_P (c))
1050         {
1051           found++;
1052           continue;
1053         }
1054       ONE_MORE_BYTE (c3);
1055       if (! UTF_8_EXTRA_OCTET_P (c3))
1056         break;
1057       if (UTF_8_4_OCTET_LEADING_P (c))
1058         {
1059           found++;
1060           continue;
1061         }
1062       ONE_MORE_BYTE (c4);
1063       if (! UTF_8_EXTRA_OCTET_P (c4))
1064         break;
1065       if (UTF_8_5_OCTET_LEADING_P (c))
1066         {
1067           found++;
1068           continue;
1069         }
1070       break;
1071     }
1072   *mask &= ~CATEGORY_MASK_UTF_8;
1073   return 0;
1074
1075  no_more_source:
1076   if (! found)
1077     return 0;
1078   *mask &= CATEGORY_MASK_UTF_8;
1079   return 1;
1080 }
1081
1082
1083 static void
1084 decode_coding_utf_8 (coding)
1085      struct coding_system *coding;
1086 {
1087   unsigned char *src = coding->source + coding->consumed;
1088   unsigned char *src_end = coding->source + coding->src_bytes;
1089   unsigned char *src_base;
1090   int *charbuf = coding->charbuf;
1091   int *charbuf_end = charbuf + coding->charbuf_size;
1092   int consumed_chars = 0, consumed_chars_base;
1093   int multibytep = coding->src_multibyte;
1094   Lisp_Object attr, eol_type, charset_list;
1095
1096   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1097
1098   while (1)
1099     {
1100       int c, c1, c2, c3, c4, c5;
1101
1102       src_base = src;
1103       consumed_chars_base = consumed_chars;
1104
1105       if (charbuf >= charbuf_end)
1106         break;
1107
1108       ONE_MORE_BYTE (c1);
1109       if (UTF_8_1_OCTET_P(c1))
1110         {
1111           c = c1;
1112           if (c == '\r')
1113             {
1114               if (EQ (eol_type, Qdos))
1115                 {
1116                   if (src == src_end)
1117                     goto no_more_source;
1118                   if (*src == '\n')
1119                     ONE_MORE_BYTE (c);
1120                 }
1121               else if (EQ (eol_type, Qmac))
1122                 c = '\n';
1123             }
1124         }
1125       else
1126         {
1127           ONE_MORE_BYTE (c2);
1128           if (! UTF_8_EXTRA_OCTET_P (c2))
1129             goto invalid_code;
1130           if (UTF_8_2_OCTET_LEADING_P (c1))
1131             c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1132           else
1133             {
1134               ONE_MORE_BYTE (c3);
1135               if (! UTF_8_EXTRA_OCTET_P (c3))
1136                 goto invalid_code;
1137               if (UTF_8_3_OCTET_LEADING_P (c1))
1138                 c = (((c1 & 0xF) << 12)
1139                      | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1140               else
1141                 {
1142                   ONE_MORE_BYTE (c4);
1143                   if (! UTF_8_EXTRA_OCTET_P (c4))
1144                     goto invalid_code;
1145                   if (UTF_8_4_OCTET_LEADING_P (c1))
1146                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1147                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1148                   else
1149                     {
1150                       ONE_MORE_BYTE (c5);
1151                       if (! UTF_8_EXTRA_OCTET_P (c5))
1152                         goto invalid_code;
1153                       if (UTF_8_5_OCTET_LEADING_P (c1))
1154                         {
1155                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1156                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1157                                | (c5 & 0x3F));
1158                           if (c > MAX_CHAR)
1159                             goto invalid_code;
1160                         }
1161                       else
1162                         goto invalid_code;
1163                     }
1164                 }
1165             }
1166         }
1167
1168       *charbuf++ = c;
1169       continue;
1170
1171     invalid_code:
1172       src = src_base;
1173       consumed_chars = consumed_chars_base;
1174       ONE_MORE_BYTE (c);
1175       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1176       coding->errors++;
1177     }
1178
1179  no_more_source:
1180   coding->consumed_char += consumed_chars_base;
1181   coding->consumed = src_base - coding->source;
1182   coding->charbuf_used = charbuf - coding->charbuf;
1183 }
1184
1185
1186 static int
1187 encode_coding_utf_8 (coding)
1188      struct coding_system *coding;
1189 {
1190   int multibytep = coding->dst_multibyte;
1191   int *charbuf = coding->charbuf;
1192   int *charbuf_end = charbuf + coding->charbuf_used;
1193   unsigned char *dst = coding->destination + coding->produced;
1194   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1195   int produced_chars = 0;
1196   int c;
1197
1198   if (multibytep)
1199     {
1200       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1201
1202       while (charbuf < charbuf_end)
1203         {
1204           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1205
1206           ASSURE_DESTINATION (safe_room);
1207           c = *charbuf++;
1208           CHAR_STRING_ADVANCE (c, pend);
1209           for (p = str; p < pend; p++)
1210             EMIT_ONE_BYTE (*p);
1211         }
1212     }
1213   else
1214     {
1215       int safe_room = MAX_MULTIBYTE_LENGTH;
1216
1217       while (charbuf < charbuf_end)
1218         {
1219           ASSURE_DESTINATION (safe_room);
1220           c = *charbuf++;
1221           dst += CHAR_STRING (c, dst);
1222           produced_chars++;
1223         }
1224     }
1225   coding->result = CODING_RESULT_SUCCESS;
1226   coding->produced_char += produced_chars;
1227   coding->produced = dst - coding->destination;
1228   return 0;
1229 }
1230
1231
1232 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1233    Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1234    Little Endian (otherwise).  If it is, return
1235    CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
1236    else return 0.  */
1237
1238 #define UTF_16_HIGH_SURROGATE_P(val) \
1239   (((val) & 0xFC00) == 0xD800)
1240
1241 #define UTF_16_LOW_SURROGATE_P(val) \
1242   (((val) & 0xFC00) == 0xDC00)
1243
1244 #define UTF_16_INVALID_P(val)   \
1245   (((val) == 0xFFFE)            \
1246    || ((val) == 0xFFFF)         \
1247    || UTF_16_LOW_SURROGATE_P (val))
1248
1249
1250 static int
1251 detect_coding_utf_16 (coding, mask)
1252      struct coding_system *coding;
1253      int *mask;
1254 {
1255   unsigned char *src = coding->source, *src_base = src;
1256   unsigned char *src_end = coding->source + coding->src_bytes;
1257   int multibytep = coding->src_multibyte;
1258   int consumed_chars = 0;
1259   int c1, c2;
1260
1261   ONE_MORE_BYTE (c1);
1262   ONE_MORE_BYTE (c2);
1263
1264   if ((c1 == 0xFF) && (c2 == 0xFE))
1265     {
1266       *mask &= CATEGORY_MASK_UTF_16_LE;
1267       return 1;
1268     }
1269   else if ((c1 == 0xFE) && (c2 == 0xFF))
1270     {
1271       *mask &= CATEGORY_MASK_UTF_16_BE;
1272       return 1;
1273     }
1274  no_more_source:
1275   return 0;
1276 }
1277
1278 static void
1279 decode_coding_utf_16 (coding)
1280      struct coding_system *coding;
1281 {
1282   unsigned char *src = coding->source + coding->consumed;
1283   unsigned char *src_end = coding->source + coding->src_bytes;
1284   unsigned char *src_base;
1285   int *charbuf = coding->charbuf;
1286   int *charbuf_end = charbuf + coding->charbuf_size;
1287   int consumed_chars = 0, consumed_chars_base;
1288   int multibytep = coding->src_multibyte;
1289   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1290   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1291   int surrogate = CODING_UTF_16_SURROGATE (coding);
1292   Lisp_Object attr, eol_type, charset_list;
1293
1294   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1295
1296   if (bom != utf_16_without_bom)
1297     {
1298       int c, c1, c2;
1299
1300       src_base = src;
1301       ONE_MORE_BYTE (c1);
1302       ONE_MORE_BYTE (c2);
1303       c = (c1 << 8) | c2;
1304       if (bom == utf_16_with_bom)
1305         {
1306           if (endian == utf_16_big_endian
1307               ? c != 0xFFFE : c != 0xFEFF)
1308             {
1309               /* We are sure that there's enouph room at CHARBUF.  */
1310               *charbuf++ = c1;
1311               *charbuf++ = c2;
1312               coding->errors++;
1313             }
1314         }
1315       else
1316         {
1317           if (c == 0xFFFE)
1318             CODING_UTF_16_ENDIAN (coding)
1319               = endian = utf_16_big_endian;
1320           else if (c == 0xFEFF)
1321             CODING_UTF_16_ENDIAN (coding)
1322               = endian = utf_16_little_endian;
1323           else
1324             {
1325               CODING_UTF_16_ENDIAN (coding)
1326                 = endian = utf_16_big_endian;
1327               src = src_base;
1328             }
1329         }
1330       CODING_UTF_16_BOM (coding) = utf_16_with_bom;
1331     }
1332
1333   while (1)
1334     {
1335       int c, c1, c2;
1336
1337       src_base = src;
1338       consumed_chars_base = consumed_chars;
1339
1340       if (charbuf + 2 >= charbuf_end)
1341         break;
1342
1343       ONE_MORE_BYTE (c1);
1344       ONE_MORE_BYTE (c2);
1345       c = (endian == utf_16_big_endian
1346            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1347       if (surrogate)
1348         {
1349           if (! UTF_16_LOW_SURROGATE_P (c))
1350             {
1351               if (endian == utf_16_big_endian)
1352                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1353               else
1354                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1355               *charbuf++ = c1;
1356               *charbuf++ = c2;
1357               coding->errors++;
1358               if (UTF_16_HIGH_SURROGATE_P (c))
1359                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1360               else
1361                 *charbuf++ = c;
1362             }
1363           else
1364             {
1365               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1366               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1367               *charbuf++ = c;
1368             }
1369         }
1370       else
1371         {
1372           if (UTF_16_HIGH_SURROGATE_P (c))
1373             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1374           else
1375             *charbuf++ = c;
1376         }
1377     }
1378
1379  no_more_source:
1380   coding->consumed_char += consumed_chars_base;
1381   coding->consumed = src_base - coding->source;
1382   coding->charbuf_used = charbuf - coding->charbuf;
1383 }
1384
1385 static int
1386 encode_coding_utf_16 (coding)
1387      struct coding_system *coding;
1388 {
1389   int multibytep = coding->dst_multibyte;
1390   int *charbuf = coding->charbuf;
1391   int *charbuf_end = charbuf + coding->charbuf_used;
1392   unsigned char *dst = coding->destination + coding->produced;
1393   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1394   int safe_room = 8;
1395   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1396   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1397   int produced_chars = 0;
1398   Lisp_Object attrs, eol_type, charset_list;
1399   int c;
1400
1401   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1402
1403   if (bom == utf_16_with_bom)
1404     {
1405       ASSURE_DESTINATION (safe_room);
1406       if (big_endian)
1407         EMIT_TWO_BYTES (0xFF, 0xFE);
1408       else
1409         EMIT_TWO_BYTES (0xFE, 0xFF);
1410       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1411     }
1412
1413   while (charbuf < charbuf_end)
1414     {
1415       ASSURE_DESTINATION (safe_room);
1416       c = *charbuf++;
1417       if (c >= MAX_UNICODE_CHAR)
1418         c = coding->default_char;
1419
1420       if (c < 0x10000)
1421         {
1422           if (big_endian)
1423             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1424           else
1425             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1426         }
1427       else
1428         {
1429           int c1, c2;
1430
1431           c -= 0x10000;
1432           c1 = (c >> 10) + 0xD800;
1433           c2 = (c & 0x3FF) + 0xDC00;
1434           if (big_endian)
1435             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1436           else
1437             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1438         }
1439     }
1440   coding->result = CODING_RESULT_SUCCESS;
1441   coding->produced = dst - coding->destination;
1442   coding->produced_char += produced_chars;
1443   return 0;
1444 }
1445
1446 \f
1447 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1448
1449 /* Emacs' internal format for representation of multiple character
1450    sets is a kind of multi-byte encoding, i.e. characters are
1451    represented by variable-length sequences of one-byte codes.
1452
1453    ASCII characters and control characters (e.g. `tab', `newline') are
1454    represented by one-byte sequences which are their ASCII codes, in
1455    the range 0x00 through 0x7F.
1456
1457    8-bit characters of the range 0x80..0x9F are represented by
1458    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1459    code + 0x20).
1460
1461    8-bit characters of the range 0xA0..0xFF are represented by
1462    one-byte sequences which are their 8-bit code.
1463
1464    The other characters are represented by a sequence of `base
1465    leading-code', optional `extended leading-code', and one or two
1466    `position-code's.  The length of the sequence is determined by the
1467    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1468    whereas extended leading-code and position-code take the range 0xA0
1469    through 0xFF.  See `charset.h' for more details about leading-code
1470    and position-code.
1471
1472    --- CODE RANGE of Emacs' internal format ---
1473    character set        range
1474    -------------        -----
1475    ascii                0x00..0x7F
1476    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1477    eight-bit-graphic    0xA0..0xBF
1478    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1479    ---------------------------------------------
1480
1481    As this is the internal character representation, the format is
1482    usually not used externally (i.e. in a file or in a data sent to a
1483    process).  But, it is possible to have a text externally in this
1484    format (i.e. by encoding by the coding system `emacs-mule').
1485
1486    In that case, a sequence of one-byte codes has a slightly different
1487    form.
1488
1489    At first, all characters in eight-bit-control are represented by
1490    one-byte sequences which are their 8-bit code.
1491
1492    Next, character composition data are represented by the byte
1493    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1494    where,
1495         METHOD is 0xF0 plus one of composition method (enum
1496         composition_method),
1497
1498         BYTES is 0xA0 plus a byte length of this composition data,
1499
1500         CHARS is 0x20 plus a number of characters composed by this
1501         data,
1502
1503         COMPONENTs are characters of multibye form or composition
1504         rules encoded by two-byte of ASCII codes.
1505
1506    In addition, for backward compatibility, the following formats are
1507    also recognized as composition data on decoding.
1508
1509    0x80 MSEQ ...
1510    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1511
1512    Here,
1513         MSEQ is a multibyte form but in these special format:
1514           ASCII: 0xA0 ASCII_CODE+0x80,
1515           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1516         RULE is a one byte code of the range 0xA0..0xF0 that
1517         represents a composition rule.
1518   */
1519
1520 char emacs_mule_bytes[256];
1521
1522 /* Leading-code followed by extended leading-code.  */
1523 #define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
1524 #define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
1525 #define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
1526 #define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
1527
1528
1529 int
1530 emacs_mule_char (coding, composition, nbytes, nchars)
1531      struct coding_system *coding;
1532      int composition;
1533      int *nbytes, *nchars;
1534 {
1535   unsigned char *src = coding->source + coding->consumed;
1536   unsigned char *src_end = coding->source + coding->src_bytes;
1537   int multibytep = coding->src_multibyte;
1538   unsigned char *src_base = src;
1539   struct charset *charset;
1540   unsigned code;
1541   int c;
1542   int consumed_chars = 0;
1543
1544   ONE_MORE_BYTE (c);
1545   if (composition)
1546     {
1547       c -= 0x20;
1548       if (c == 0x80)
1549         {
1550           ONE_MORE_BYTE (c);
1551           if (c < 0xA0)
1552             goto invalid_code;
1553           *nbytes = src - src_base;
1554           *nchars = consumed_chars;
1555           return (c - 0x80);
1556         }
1557     }
1558
1559   switch (emacs_mule_bytes[c])
1560     {
1561     case 2:
1562       if (! (charset = emacs_mule_charset[c]))
1563         goto invalid_code;
1564       ONE_MORE_BYTE (c);
1565       code = c & 0x7F;
1566       break;
1567
1568     case 3:
1569       if (c == LEADING_CODE_PRIVATE_11
1570           || c == LEADING_CODE_PRIVATE_12)
1571         {
1572           ONE_MORE_BYTE (c);
1573           if (! (charset = emacs_mule_charset[c]))
1574             goto invalid_code;
1575           ONE_MORE_BYTE (c);
1576           code = c & 0x7F;
1577         }
1578       else
1579         {
1580           if (! (charset = emacs_mule_charset[c]))
1581             goto invalid_code;
1582           ONE_MORE_BYTE (c);
1583           code = (c & 0x7F) << 7;
1584           ONE_MORE_BYTE (c);
1585           code |= c & 0x7F;
1586         }
1587       break;
1588
1589     case 4:
1590       if (! (charset = emacs_mule_charset[c]))
1591         goto invalid_code;
1592       ONE_MORE_BYTE (c);
1593       code = (c & 0x7F) << 7;
1594       ONE_MORE_BYTE (c);
1595       code |= c & 0x7F;
1596       break;
1597
1598     case 1:
1599       code = c;
1600       charset = CHARSET_FROM_ID (ASCII_BYTE_P (code) ? charset_ascii
1601                                  : code < 0xA0 ? charset_8_bit_control
1602                                  : charset_8_bit_graphic);
1603       break;
1604
1605     default:
1606       abort ();
1607     }
1608   c = DECODE_CHAR (charset, code);
1609   if (c < 0)
1610     goto invalid_code;
1611   *nbytes = src - src_base;
1612   *nchars = consumed_chars;
1613   return c;
1614
1615  no_more_source:
1616   return -2;
1617
1618  invalid_code:
1619   return -1;
1620 }
1621
1622
1623 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1624    Check if a text is encoded in `emacs-mule'.  */
1625
1626 static int
1627 detect_coding_emacs_mule (coding, mask)
1628      struct coding_system *coding;
1629      int *mask;
1630 {
1631   unsigned char *src = coding->source, *src_base = src;
1632   unsigned char *src_end = coding->source + coding->src_bytes;
1633   int multibytep = coding->src_multibyte;
1634   int consumed_chars = 0;
1635   int c;
1636   int found = 0;
1637
1638   /* A coding system of this category is always ASCII compatible.  */
1639   src += coding->head_ascii;
1640
1641   while (1)
1642     {
1643       ONE_MORE_BYTE (c);
1644
1645       if (c == 0x80)
1646         {
1647           /* Perhaps the start of composite character.  We simple skip
1648              it because analyzing it is too heavy for detecting.  But,
1649              at least, we check that the composite character
1650              constitues of more than 4 bytes.  */
1651           unsigned char *src_base;
1652
1653         repeat:
1654           src_base = src;
1655           do
1656             {
1657               ONE_MORE_BYTE (c);
1658             }
1659           while (c >= 0xA0);
1660
1661           if (src - src_base <= 4)
1662             break;
1663           found = 1;
1664           if (c == 0x80)
1665             goto repeat;
1666         }
1667
1668       if (c < 0x80)
1669         {
1670           if (c < 0x20
1671               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1672             break;
1673         }
1674       else
1675         {
1676           unsigned char *src_base = src - 1;
1677
1678           do
1679             {
1680               ONE_MORE_BYTE (c);
1681             }
1682           while (c >= 0xA0);
1683           if (src - src_base != emacs_mule_bytes[*src_base])
1684             break;
1685           found = 1;
1686         }
1687     }
1688   *mask &= ~CATEGORY_MASK_EMACS_MULE;
1689   return 0;
1690
1691  no_more_source:
1692   if (!found)
1693     return 0;
1694   *mask &= CATEGORY_MASK_EMACS_MULE;
1695   return 1;
1696 }
1697
1698
1699 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1700
1701 /* Decode a character represented as a component of composition
1702    sequence of Emacs 20/21 style at SRC.  Set C to that character and
1703    update SRC to the head of next character (or an encoded composition
1704    rule).  If SRC doesn't points a composition component, set C to -1.
1705    If SRC points an invalid byte sequence, global exit by a return
1706    value 0.  */
1707
1708 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1709   if (1)                                                        \
1710     {                                                           \
1711       int c;                                                    \
1712       int nbytes, nchars;                                       \
1713                                                                 \
1714       if (src == src_end)                                       \
1715         break;                                                  \
1716       c = emacs_mule_char (coding, 1, &nbytes, &nchars);        \
1717       if (c < 0)                                                \
1718         {                                                       \
1719           if (c == -2)                                          \
1720             break;                                              \
1721           goto invalid_code;                                    \
1722         }                                                       \
1723       *buf++ = c;                                               \
1724       src += nbytes;                                            \
1725       consumed_chars += nchars;                                 \
1726     }                                                           \
1727   else
1728
1729
1730 /* Decode a composition rule represented as a component of composition
1731    sequence of Emacs 20 style at SRC.  Set C to the rule.  If SRC
1732    points an invalid byte sequence, set C to -1.  */
1733
1734 #define DECODE_EMACS_MULE_COMPOSITION_RULE(buf)         \
1735   do {                                                  \
1736     int c, gref, nref;                                  \
1737                                                         \
1738     if (src < src_end)                                  \
1739       goto invalid_code;                                \
1740     ONE_MORE_BYTE_NO_CHECK (c);                         \
1741     c -= 0xA0;                                          \
1742     if (c < 0 || c >= 81)                               \
1743       goto invalid_code;                                \
1744                                                         \
1745     gref = c / 9, nref = c % 9;                         \
1746     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1747   } while (0)
1748
1749
1750 #define ADD_COMPOSITION_DATA(buf, method, nchars)       \
1751   do {                                                  \
1752     *buf++ = -5;                                        \
1753     *buf++ = coding->produced_char + char_offset;       \
1754     *buf++ = CODING_ANNOTATE_COMPOSITION_MASK;          \
1755     *buf++ = method;                                    \
1756     *buf++ = nchars;                                    \
1757   } while (0)
1758
1759
1760 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1761   do {                                                                  \
1762     /* Emacs 21 style format.  The first three bytes at SRC are         \
1763        (METHOD - 0xF0), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1764        the byte length of this composition information, CHARS is the    \
1765        number of characters composed by this composition.  */           \
1766     enum composition_method method = c - 0xF0;                          \
1767     int consumed_chars_limit;                                           \
1768     int nbytes, nchars;                                                 \
1769                                                                         \
1770     ONE_MORE_BYTE (c);                                                  \
1771     nbytes = c - 0xA0;                                                  \
1772     if (nbytes < 3)                                                     \
1773       goto invalid_code;                                                \
1774     ONE_MORE_BYTE (c);                                                  \
1775     nchars = c - 0xA0;                                                  \
1776     ADD_COMPOSITION_DATA (charbuf, method, nchars);                     \
1777     consumed_chars_limit = consumed_chars_base + nbytes;                \
1778     if (method != COMPOSITION_RELATIVE)                                 \
1779       {                                                                 \
1780         int i = 0;                                                      \
1781         while (consumed_chars < consumed_chars_limit)                   \
1782           {                                                             \
1783             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1784               DECODE_EMACS_MULE_COMPOSITION_RULE (charbuf);             \
1785             else                                                        \
1786               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1787           }                                                             \
1788         if (consumed_chars < consumed_chars_limit)                      \
1789           goto invalid_code;                                            \
1790       }                                                                 \
1791   } while (0)
1792
1793
1794 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1795   do {                                                          \
1796     /* Emacs 20 style format for relative composition.  */      \
1797     /* Store multibyte form of characters to be composed.  */   \
1798     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1799     int *buf = components;                                      \
1800     int i, j;                                                   \
1801                                                                 \
1802     src = src_base;                                             \
1803     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1804     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1805       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
1806     if (i < 2)                                                  \
1807       goto invalid_code;                                        \
1808     ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i);    \
1809     for (j = 0; j < i; j++)                                     \
1810       *charbuf++ = components[j];                               \
1811   } while (0)
1812
1813
1814 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
1815   do {                                                          \
1816     /* Emacs 20 style format for rule-base composition.  */     \
1817     /* Store multibyte form of characters to be composed.  */   \
1818     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1819     int *buf = components;                                      \
1820     int i, j;                                                   \
1821                                                                 \
1822     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
1823     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1824       {                                                         \
1825         DECODE_EMACS_MULE_COMPOSITION_RULE (buf);               \
1826         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
1827       }                                                         \
1828     if (i < 1 || (buf - components) % 2 == 0)                   \
1829       goto invalid_code;                                        \
1830     if (charbuf + i + (i / 2) + 1 < charbuf_end)                \
1831       goto no_more_source;                                      \
1832     ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i);       \
1833     for (j = 0; j < i; j++)                                     \
1834       *charbuf++ = components[j];                               \
1835     for (j = 0; j < i; j += 2)                                  \
1836       *charbuf++ = components[j];                               \
1837   } while (0)
1838
1839
1840 static void
1841 decode_coding_emacs_mule (coding)
1842      struct coding_system *coding;
1843 {
1844   unsigned char *src = coding->source + coding->consumed;
1845   unsigned char *src_end = coding->source + coding->src_bytes;
1846   unsigned char *src_base;
1847   int *charbuf = coding->charbuf;
1848   int *charbuf_end = charbuf + coding->charbuf_size;
1849   int consumed_chars = 0, consumed_chars_base;
1850   int char_offset = 0;
1851   int multibytep = coding->src_multibyte;
1852   Lisp_Object attrs, eol_type, charset_list;
1853
1854   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1855
1856   while (1)
1857     {
1858       int c;
1859
1860       src_base = src;
1861       consumed_chars_base = consumed_chars;
1862
1863       if (charbuf >= charbuf_end)
1864         break;
1865
1866       ONE_MORE_BYTE (c);
1867
1868       if (c < 0x80)
1869         {
1870           if (c == '\r')
1871             {
1872               if (EQ (eol_type, Qdos))
1873                 {
1874                   if (src == src_end)
1875                     goto no_more_source;
1876                   if (*src == '\n')
1877                     ONE_MORE_BYTE (c);
1878                 }
1879               else if (EQ (eol_type, Qmac))
1880                 c = '\n';
1881             }
1882           *charbuf++ = c;
1883           char_offset++;
1884         }
1885       else if (c == 0x80)
1886         {
1887           if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end)
1888             break;
1889           ONE_MORE_BYTE (c);
1890           if (c - 0xF0 >= COMPOSITION_RELATIVE
1891               && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
1892             DECODE_EMACS_MULE_21_COMPOSITION (c);
1893           else if (c < 0xC0)
1894             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1895           else if (c == 0xFF)
1896             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1897           else
1898             goto invalid_code;
1899         }
1900       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1901         {
1902           int nbytes, nchars;
1903           src--;
1904           c = emacs_mule_char (coding, 0, &nbytes, &nchars);
1905           if (c < 0)
1906             {
1907               if (c == -2)
1908                 break;
1909               goto invalid_code;
1910             }
1911           *charbuf++ = c;
1912           char_offset++;
1913         }
1914       continue;
1915
1916     invalid_code:
1917       src = src_base;
1918       consumed_chars = consumed_chars_base;
1919       ONE_MORE_BYTE (c);
1920       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1921       coding->errors++;
1922     }
1923
1924  no_more_source:
1925   coding->consumed_char += consumed_chars_base;
1926   coding->consumed = src_base - coding->source;
1927   coding->charbuf_used = charbuf - coding->charbuf;
1928 }
1929
1930
1931 #define EMACS_MULE_LEADING_CODES(id, codes)     \
1932   do {                                          \
1933     if (id < 0xA0)                              \
1934       codes[0] = id, codes[1] = 0;              \
1935     else if (id < 0xE0)                         \
1936       codes[0] = 0x9A, codes[1] = id;           \
1937     else if (id < 0xF0)                         \
1938       codes[0] = 0x9B, codes[1] = id;           \
1939     else if (id < 0xF5)                         \
1940       codes[0] = 0x9C, codes[1] = id;           \
1941     else                                        \
1942       codes[0] = 0x9D, codes[1] = id;           \
1943   } while (0);
1944
1945
1946 static int
1947 encode_coding_emacs_mule (coding)
1948      struct coding_system *coding;
1949 {
1950   int multibytep = coding->dst_multibyte;
1951   int *charbuf = coding->charbuf;
1952   int *charbuf_end = charbuf + coding->charbuf_used;
1953   unsigned char *dst = coding->destination + coding->produced;
1954   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1955   int safe_room = 8;
1956   int produced_chars = 0;
1957   Lisp_Object attrs, eol_type, charset_list;
1958   int c;
1959
1960   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1961
1962   while (charbuf < charbuf_end)
1963     {
1964       ASSURE_DESTINATION (safe_room);
1965       c = *charbuf++;
1966       if (ASCII_CHAR_P (c))
1967         EMIT_ONE_ASCII_BYTE (c);
1968       else
1969         {
1970           struct charset *charset;
1971           unsigned code;
1972           int dimension;
1973           int emacs_mule_id;
1974           unsigned char leading_codes[2];
1975
1976           charset = char_charset (c, charset_list, &code);
1977           if (! charset)
1978             {
1979               c = coding->default_char;
1980               if (ASCII_CHAR_P (c))
1981                 {
1982                   EMIT_ONE_ASCII_BYTE (c);
1983                   continue;
1984                 }
1985               charset = char_charset (c, charset_list, &code);
1986             }
1987           dimension = CHARSET_DIMENSION (charset);
1988           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
1989           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
1990           EMIT_ONE_BYTE (leading_codes[0]);
1991           if (leading_codes[1])
1992             EMIT_ONE_BYTE (leading_codes[1]);
1993           if (dimension == 1)
1994             EMIT_ONE_BYTE (code);
1995           else
1996             {
1997               EMIT_ONE_BYTE (code >> 8);
1998               EMIT_ONE_BYTE (code & 0xFF);
1999             }
2000         }
2001     }
2002   coding->result = CODING_RESULT_SUCCESS;
2003   coding->produced_char += produced_chars;
2004   coding->produced = dst - coding->destination;
2005   return 0;
2006 }
2007
2008 \f
2009 /*** 7. ISO2022 handlers ***/
2010
2011 /* The following note describes the coding system ISO2022 briefly.
2012    Since the intention of this note is to help understand the
2013    functions in this file, some parts are NOT ACCURATE or OVERLY
2014    SIMPLIFIED.  For thorough understanding, please refer to the
2015    original document of ISO2022.
2016
2017    ISO2022 provides many mechanisms to encode several character sets
2018    in 7-bit and 8-bit environments.  For 7-bite environments, all text
2019    is encoded using bytes less than 128.  This may make the encoded
2020    text a little bit longer, but the text passes more easily through
2021    several gateways, some of which strip off MSB (Most Signigant Bit).
2022
2023    There are two kinds of character sets: control character set and
2024    graphic character set.  The former contains control characters such
2025    as `newline' and `escape' to provide control functions (control
2026    functions are also provided by escape sequences).  The latter
2027    contains graphic characters such as 'A' and '-'.  Emacs recognizes
2028    two control character sets and many graphic character sets.
2029
2030    Graphic character sets are classified into one of the following
2031    four classes, according to the number of bytes (DIMENSION) and
2032    number of characters in one dimension (CHARS) of the set:
2033    - DIMENSION1_CHARS94
2034    - DIMENSION1_CHARS96
2035    - DIMENSION2_CHARS94
2036    - DIMENSION2_CHARS96
2037
2038    In addition, each character set is assigned an identification tag,
2039    unique for each set, called "final character" (denoted as <F>
2040    hereafter).  The <F> of each character set is decided by ECMA(*)
2041    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2042    (0x30..0x3F are for private use only).
2043
2044    Note (*): ECMA = European Computer Manufacturers Association
2045
2046    Here are examples of graphic character set [NAME(<F>)]:
2047         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2048         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2049         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2050         o DIMENSION2_CHARS96 -- none for the moment
2051
2052    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2053         C0 [0x00..0x1F] -- control character plane 0
2054         GL [0x20..0x7F] -- graphic character plane 0
2055         C1 [0x80..0x9F] -- control character plane 1
2056         GR [0xA0..0xFF] -- graphic character plane 1
2057
2058    A control character set is directly designated and invoked to C0 or
2059    C1 by an escape sequence.  The most common case is that:
2060    - ISO646's  control character set is designated/invoked to C0, and
2061    - ISO6429's control character set is designated/invoked to C1,
2062    and usually these designations/invocations are omitted in encoded
2063    text.  In a 7-bit environment, only C0 can be used, and a control
2064    character for C1 is encoded by an appropriate escape sequence to
2065    fit into the environment.  All control characters for C1 are
2066    defined to have corresponding escape sequences.
2067
2068    A graphic character set is at first designated to one of four
2069    graphic registers (G0 through G3), then these graphic registers are
2070    invoked to GL or GR.  These designations and invocations can be
2071    done independently.  The most common case is that G0 is invoked to
2072    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2073    these invocations and designations are omitted in encoded text.
2074    In a 7-bit environment, only GL can be used.
2075
2076    When a graphic character set of CHARS94 is invoked to GL, codes
2077    0x20 and 0x7F of the GL area work as control characters SPACE and
2078    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2079    be used.
2080
2081    There are two ways of invocation: locking-shift and single-shift.
2082    With locking-shift, the invocation lasts until the next different
2083    invocation, whereas with single-shift, the invocation affects the
2084    following character only and doesn't affect the locking-shift
2085    state.  Invocations are done by the following control characters or
2086    escape sequences:
2087
2088    ----------------------------------------------------------------------
2089    abbrev  function                  cntrl escape seq   description
2090    ----------------------------------------------------------------------
2091    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2092    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2093    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2094    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2095    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2096    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2097    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2098    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2099    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2100    ----------------------------------------------------------------------
2101    (*) These are not used by any known coding system.
2102
2103    Control characters for these functions are defined by macros
2104    ISO_CODE_XXX in `coding.h'.
2105
2106    Designations are done by the following escape sequences:
2107    ----------------------------------------------------------------------
2108    escape sequence      description
2109    ----------------------------------------------------------------------
2110    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2111    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2112    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2113    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2114    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2115    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2116    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2117    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2118    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2119    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2120    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2121    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2122    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2123    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2124    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2125    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2126    ----------------------------------------------------------------------
2127
2128    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2129    of dimension 1, chars 94, and final character <F>, etc...
2130
2131    Note (*): Although these designations are not allowed in ISO2022,
2132    Emacs accepts them on decoding, and produces them on encoding
2133    CHARS96 character sets in a coding system which is characterized as
2134    7-bit environment, non-locking-shift, and non-single-shift.
2135
2136    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2137    '(' must be omitted.  We refer to this as "short-form" hereafter.
2138
2139    Now you may notice that there are a lot of ways for encoding the
2140    same multilingual text in ISO2022.  Actually, there exist many
2141    coding systems such as Compound Text (used in X11's inter client
2142    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
2143    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
2144    localized platforms), and all of these are variants of ISO2022.
2145
2146    In addition to the above, Emacs handles two more kinds of escape
2147    sequences: ISO6429's direction specification and Emacs' private
2148    sequence for specifying character composition.
2149
2150    ISO6429's direction specification takes the following form:
2151         o CSI ']'      -- end of the current direction
2152         o CSI '0' ']'  -- end of the current direction
2153         o CSI '1' ']'  -- start of left-to-right text
2154         o CSI '2' ']'  -- start of right-to-left text
2155    The control character CSI (0x9B: control sequence introducer) is
2156    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2157
2158    Character composition specification takes the following form:
2159         o ESC '0' -- start relative composition
2160         o ESC '1' -- end composition
2161         o ESC '2' -- start rule-base composition (*)
2162         o ESC '3' -- start relative composition with alternate chars  (**)
2163         o ESC '4' -- start rule-base composition with alternate chars  (**)
2164   Since these are not standard escape sequences of any ISO standard,
2165   the use of them for these meaning is restricted to Emacs only.
2166
2167   (*) This form is used only in Emacs 20.5 and the older versions,
2168   but the newer versions can safely decode it.
2169   (**) This form is used only in Emacs 21.1 and the newer versions,
2170   and the older versions can't decode it.
2171
2172   Here's a list of examples usages of these composition escape
2173   sequences (categorized by `enum composition_method').
2174
2175   COMPOSITION_RELATIVE:
2176         ESC 0 CHAR [ CHAR ] ESC 1
2177   COMPOSITOIN_WITH_RULE:
2178         ESC 2 CHAR [ RULE CHAR ] ESC 1
2179   COMPOSITION_WITH_ALTCHARS:
2180         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2181   COMPOSITION_WITH_RULE_ALTCHARS:
2182         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2183
2184 enum iso_code_class_type iso_code_class[256];
2185
2186 #define SAFE_CHARSET_P(coding, id)      \
2187   ((id) <= (coding)->max_charset_id     \
2188    && (coding)->safe_charsets[id] >= 0)
2189
2190
2191 #define SHIFT_OUT_OK(category)  \
2192   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2193
2194 static void
2195 setup_iso_safe_charsets (Lisp_Object attrs)
2196 {
2197   Lisp_Object charset_list, safe_charsets;
2198   Lisp_Object request;
2199   Lisp_Object reg_usage;
2200   Lisp_Object tail;
2201   int reg94, reg96;
2202   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2203   int max_charset_id;
2204
2205   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2206   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2207       && ! EQ (charset_list, Viso_2022_charset_list))
2208     {
2209       CODING_ATTR_CHARSET_LIST (attrs)
2210         = charset_list = Viso_2022_charset_list;
2211       ASET (attrs, coding_attr_safe_charsets, Qnil);
2212     }
2213
2214   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2215     return;
2216
2217   max_charset_id = 0;
2218   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2219     {
2220       int id = XINT (XCAR (tail));
2221       if (max_charset_id < id)
2222         max_charset_id = id;
2223     }
2224
2225   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2226                                 make_number (255));
2227   request = AREF (attrs, coding_attr_iso_request);
2228   reg_usage = AREF (attrs, coding_attr_iso_usage);
2229   reg94 = XINT (XCAR (reg_usage));
2230   reg96 = XINT (XCDR (reg_usage));
2231
2232   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2233     {
2234       Lisp_Object id;
2235       Lisp_Object reg;
2236       struct charset *charset;
2237
2238       id = XCAR (tail);
2239       charset = CHARSET_FROM_ID (XINT (id));
2240       reg = Fcdr (Fassq (request, id));
2241       if (! NILP (reg))
2242         XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2243       else if (charset->iso_chars_96)
2244         {
2245           if (reg96 < 4)
2246             XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2247         }
2248       else
2249         {
2250           if (reg94 < 4)
2251             XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2252         }
2253     }
2254   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2255 }
2256
2257
2258 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2259    Check if a text is encoded in ISO2022.  If it is, returns an
2260    integer in which appropriate flag bits any of:
2261         CATEGORY_MASK_ISO_7
2262         CATEGORY_MASK_ISO_7_TIGHT
2263         CATEGORY_MASK_ISO_8_1
2264         CATEGORY_MASK_ISO_8_2
2265         CATEGORY_MASK_ISO_7_ELSE
2266         CATEGORY_MASK_ISO_8_ELSE
2267    are set.  If a code which should never appear in ISO2022 is found,
2268    returns 0.  */
2269
2270 static int
2271 detect_coding_iso_2022 (coding, mask)
2272      struct coding_system *coding;
2273      int *mask;
2274 {
2275   unsigned char *src = coding->source, *src_base = src;
2276   unsigned char *src_end = coding->source + coding->src_bytes;
2277   int multibytep = coding->src_multibyte;
2278   int mask_iso = CATEGORY_MASK_ISO;
2279   int mask_found = 0, mask_8bit_found = 0;
2280   int reg[4], shift_out = 0, single_shifting = 0;
2281   int id;
2282   int c, c1;
2283   int consumed_chars = 0;
2284   int i;
2285
2286   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2287     {
2288       struct coding_system *this = &(coding_categories[i]);
2289       Lisp_Object attrs, val;
2290
2291       attrs = CODING_ID_ATTRS (this->id);
2292       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2293           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2294         setup_iso_safe_charsets (attrs);
2295       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2296       this->max_charset_id = XSTRING (val)->size - 1;
2297       this->safe_charsets = (char *) XSTRING (val)->data;
2298     }
2299
2300   /* A coding system of this category is always ASCII compatible.  */
2301   src += coding->head_ascii;
2302
2303   reg[0] = charset_ascii, reg[1] = reg[2] = reg[3] = -1;
2304   while (mask_iso && src < src_end)
2305     {
2306       ONE_MORE_BYTE (c);
2307       switch (c)
2308         {
2309         case ISO_CODE_ESC:
2310           if (inhibit_iso_escape_detection)
2311             break;
2312           single_shifting = 0;
2313           ONE_MORE_BYTE (c);
2314           if (c >= '(' && c <= '/')
2315             {
2316               /* Designation sequence for a charset of dimension 1.  */
2317               ONE_MORE_BYTE (c1);
2318               if (c1 < ' ' || c1 >= 0x80
2319                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2320                 /* Invalid designation sequence.  Just ignore.  */
2321                 break;
2322               reg[(c - '(') % 4] = id;
2323             }
2324           else if (c == '$')
2325             {
2326               /* Designation sequence for a charset of dimension 2.  */
2327               ONE_MORE_BYTE (c);
2328               if (c >= '@' && c <= 'B')
2329                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2330                 reg[0] = id = iso_charset_table[1][0][c];
2331               else if (c >= '(' && c <= '/')
2332                 {
2333                   ONE_MORE_BYTE (c1);
2334                   if (c1 < ' ' || c1 >= 0x80
2335                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2336                     /* Invalid designation sequence.  Just ignore.  */
2337                     break;
2338                   reg[(c - '(') % 4] = id;
2339                 }
2340               else
2341                 /* Invalid designation sequence.  Just ignore.  */
2342                 break;
2343             }
2344           else if (c == 'N' || c == 'O')
2345             {
2346               /* ESC <Fe> for SS2 or SS3.  */
2347               mask_iso &= CATEGORY_MASK_ISO_7_ELSE;
2348               break;
2349             }
2350           else if (c >= '0' && c <= '4')
2351             {
2352               /* ESC <Fp> for start/end composition.  */
2353               mask_found |= CATEGORY_MASK_ISO;
2354               break;
2355             }
2356           else
2357             {
2358               /* Invalid escape sequence.  */
2359               mask_iso &= ~CATEGORY_MASK_ISO_ESCAPE;
2360               break;
2361             }
2362
2363           /* We found a valid designation sequence for CHARSET.  */
2364           mask_iso &= ~CATEGORY_MASK_ISO_8BIT;
2365           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2366                               id))
2367             mask_found |= CATEGORY_MASK_ISO_7;
2368           else
2369             mask_iso &= ~CATEGORY_MASK_ISO_7;
2370           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2371                               id))
2372             mask_found |= CATEGORY_MASK_ISO_7_TIGHT;
2373           else
2374             mask_iso &= ~CATEGORY_MASK_ISO_7_TIGHT;
2375           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2376                               id))
2377             mask_found |= CATEGORY_MASK_ISO_7_ELSE;
2378           else
2379             mask_iso &= ~CATEGORY_MASK_ISO_7_ELSE;
2380           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2381                               id))
2382             mask_found |= CATEGORY_MASK_ISO_8_ELSE;
2383           else
2384             mask_iso &= ~CATEGORY_MASK_ISO_8_ELSE;
2385           break;
2386
2387         case ISO_CODE_SO:
2388           if (inhibit_iso_escape_detection)
2389             break;
2390           single_shifting = 0;
2391           if (shift_out == 0
2392               && (reg[1] >= 0
2393                   || SHIFT_OUT_OK (coding_category_iso_7_else)
2394                   || SHIFT_OUT_OK (coding_category_iso_8_else)))
2395             {
2396               /* Locking shift out.  */
2397               mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2398               mask_found |= CATEGORY_MASK_ISO_ELSE;
2399             }
2400           break;
2401
2402         case ISO_CODE_SI:
2403           if (inhibit_iso_escape_detection)
2404             break;
2405           single_shifting = 0;
2406           if (shift_out == 1)
2407             {
2408               /* Locking shift in.  */
2409               mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2410               mask_found |= CATEGORY_MASK_ISO_ELSE;
2411             }
2412           break;
2413
2414         case ISO_CODE_CSI:
2415           single_shifting = 0;
2416         case ISO_CODE_SS2:
2417         case ISO_CODE_SS3:
2418           {
2419             int newmask = CATEGORY_MASK_ISO_8_ELSE;
2420
2421             if (inhibit_iso_escape_detection)
2422               break;
2423             if (c != ISO_CODE_CSI)
2424               {
2425                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2426                     & CODING_ISO_FLAG_SINGLE_SHIFT)
2427                   newmask |= CATEGORY_MASK_ISO_8_1;
2428                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2429                     & CODING_ISO_FLAG_SINGLE_SHIFT)
2430                   newmask |= CATEGORY_MASK_ISO_8_2;
2431                 single_shifting = 1;
2432               }
2433             if (VECTORP (Vlatin_extra_code_table)
2434                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2435               {
2436                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2437                     & CODING_ISO_FLAG_LATIN_EXTRA)
2438                   newmask |= CATEGORY_MASK_ISO_8_1;
2439                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2440                     & CODING_ISO_FLAG_LATIN_EXTRA)
2441                   newmask |= CATEGORY_MASK_ISO_8_2;
2442               }
2443             mask_iso &= newmask;
2444             mask_found |= newmask;
2445           }
2446           break;
2447
2448         default:
2449           if (c < 0x80)
2450             {
2451               single_shifting = 0;
2452               break;
2453             }
2454           else if (c < 0xA0)
2455             {
2456               single_shifting = 0;
2457               mask_8bit_found = 1;
2458               if (VECTORP (Vlatin_extra_code_table)
2459                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2460                 {
2461                   int newmask = 0;
2462
2463                   if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2464                       & CODING_ISO_FLAG_LATIN_EXTRA)
2465                     newmask |= CATEGORY_MASK_ISO_8_1;
2466                   if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2467                       & CODING_ISO_FLAG_LATIN_EXTRA)
2468                     newmask |= CATEGORY_MASK_ISO_8_2;
2469                   mask_iso &= newmask;
2470                   mask_found |= newmask;
2471                 }
2472               else
2473                 return 0;
2474             }
2475           else
2476             {
2477               mask_iso &= ~(CATEGORY_MASK_ISO_7BIT
2478                             | CATEGORY_MASK_ISO_7_ELSE);
2479               mask_found |= CATEGORY_MASK_ISO_8_1;
2480               mask_8bit_found = 1;
2481               /* Check the length of succeeding codes of the range
2482                  0xA0..0FF.  If the byte length is odd, we exclude
2483                  CATEGORY_MASK_ISO_8_2.  We can check this only
2484                  when we are not single shifting.  */
2485               if (!single_shifting
2486                   && mask_iso & CATEGORY_MASK_ISO_8_2)
2487                 {
2488                   int i = 1;
2489                   while (src < src_end)
2490                     {
2491                       ONE_MORE_BYTE (c);
2492                       if (c < 0xA0)
2493                         break;
2494                       i++;
2495                     }
2496
2497                   if (i & 1 && src < src_end)
2498                     mask_iso &= ~CATEGORY_MASK_ISO_8_2;
2499                   else
2500                     mask_found |= CATEGORY_MASK_ISO_8_2;
2501                 }
2502             }
2503           break;
2504         }
2505     }
2506  no_more_source:
2507   if (!mask_iso)
2508     {
2509       *mask &= ~CATEGORY_MASK_ISO;
2510       return 0;
2511     }
2512   if (!mask_found)
2513     return 0;
2514   *mask &= mask_iso & mask_found;
2515   if (! mask_8bit_found)
2516     *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE);
2517   return 1;
2518 }
2519
2520
2521 /* Set designation state into CODING.  */
2522 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2523   do {                                                                  \
2524     int id, prev;                                                       \
2525                                                                         \
2526     if (final < '0' || final >= 128                                     \
2527         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2528         || !SAFE_CHARSET_P (coding, id))                                \
2529       {                                                                 \
2530         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2531         goto invalid_code;                                              \
2532       }                                                                 \
2533     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2534     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2535     /* If there was an invalid designation to REG previously, and this  \
2536        designation is ASCII to REG, we should keep this designation     \
2537        sequence.  */                                                    \
2538     if (prev == -2 && id == charset_ascii)                              \
2539       goto invalid_code;                                                \
2540   } while (0)
2541
2542
2543 #define MAYBE_FINISH_COMPOSITION()                              \
2544   do {                                                          \
2545     int i;                                                      \
2546     if (composition_state == COMPOSING_NO)                      \
2547       break;                                                    \
2548     /* It is assured that we have enough room for producing     \
2549        characters stored in the table `components'.  */         \
2550     if (charbuf + component_idx > charbuf_end)                  \
2551       goto no_more_source;                                      \
2552     composition_state = COMPOSING_NO;                           \
2553     if (method == COMPOSITION_RELATIVE                          \
2554         || method == COMPOSITION_WITH_ALTCHARS)                 \
2555       {                                                         \
2556         for (i = 0; i < component_idx; i++)                     \
2557           *charbuf++ = components[i];                           \
2558         char_offset += component_idx;                           \
2559       }                                                         \
2560     else                                                        \
2561       {                                                         \
2562         for (i = 0; i < component_idx; i += 2)                  \
2563           *charbuf++ = components[i];                           \
2564         char_offset += (component_idx / 2) + 1;                 \
2565       }                                                         \
2566   } while (0)
2567
2568
2569 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2570    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2571    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2572    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2573    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2574   */
2575
2576 #define DECODE_COMPOSITION_START(c1)                                    \
2577   do {                                                                  \
2578     if (c1 == '0'                                                       \
2579         && composition_state == COMPOSING_COMPONENT_CHAR)               \
2580       {                                                                 \
2581         component_len = component_idx;                                  \
2582         composition_state = COMPOSING_CHAR;                             \
2583       }                                                                 \
2584     else                                                                \
2585       {                                                                 \
2586         unsigned char *p;                                               \
2587                                                                         \
2588         MAYBE_FINISH_COMPOSITION ();                                    \
2589         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2590           goto no_more_source;                                          \
2591         for (p = src; p < src_end - 1; p++)                             \
2592           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2593             break;                                                      \
2594         if (p == src_end - 1)                                           \
2595           {                                                             \
2596             if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
2597               goto invalid_code;                                        \
2598             goto no_more_source;                                        \
2599           }                                                             \
2600                                                                         \
2601         /* This is surely the start of a composition.  */               \
2602         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2603                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2604                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2605                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2606         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2607                              : COMPOSING_COMPONENT_CHAR);               \
2608         component_idx = component_len = 0;                              \
2609       }                                                                 \
2610   } while (0)
2611
2612
2613 /* Handle compositoin end sequence ESC 1.  */
2614
2615 #define DECODE_COMPOSITION_END()                                        \
2616   do {                                                                  \
2617     int nchars = (component_len > 0 ? component_idx - component_len     \
2618                   : method == COMPOSITION_RELATIVE ? component_idx      \
2619                   : (component_idx + 1) / 2);                           \
2620     int i;                                                              \
2621     int *saved_charbuf = charbuf;                                       \
2622                                                                         \
2623     ADD_COMPOSITION_DATA (charbuf, method, nchars);                     \
2624     if (method != COMPOSITION_RELATIVE)                                 \
2625       {                                                                 \
2626         if (component_len == 0)                                         \
2627           for (i = 0; i < component_idx; i++)                           \
2628             *charbuf++ = components[i];                                 \
2629         else                                                            \
2630           for (i = 0; i < component_len; i++)                           \
2631             *charbuf++ = components[i];                                 \
2632         *saved_charbuf = saved_charbuf - charbuf;                       \
2633       }                                                                 \
2634     if (method == COMPOSITION_WITH_RULE)                                \
2635       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2636         *charbuf++ = components[i];                                     \
2637     else                                                                \
2638       for (i = component_len; i < component_idx; i++, char_offset++)    \
2639         *charbuf++ = components[i];                                     \
2640     coding->annotated = 1;                                              \
2641     composition_state = COMPOSING_NO;                                   \
2642   } while (0)
2643
2644
2645 /* Decode a composition rule from the byte C1 (and maybe one more byte
2646    from SRC) and store one encoded composition rule in
2647    coding->cmp_data.  */
2648
2649 #define DECODE_COMPOSITION_RULE(c1)                                     \
2650   do {                                                                  \
2651     (c1) -= 32;                                                         \
2652     if (c1 < 81)                /* old format (before ver.21) */        \
2653       {                                                                 \
2654         int gref = (c1) / 9;                                            \
2655         int nref = (c1) % 9;                                            \
2656         if (gref == 4) gref = 10;                                       \
2657         if (nref == 4) nref = 10;                                       \
2658         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2659       }                                                                 \
2660     else if (c1 < 93)           /* new format (after ver.21) */         \
2661       {                                                                 \
2662         ONE_MORE_BYTE (c2);                                             \
2663         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2664       }                                                                 \
2665     else                                                                \
2666       c1 = 0;                                                           \
2667   } while (0)
2668
2669
2670 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2671
2672 static void
2673 decode_coding_iso_2022 (coding)
2674      struct coding_system *coding;
2675 {
2676   unsigned char *src = coding->source + coding->consumed;
2677   unsigned char *src_end = coding->source + coding->src_bytes;
2678   unsigned char *src_base;
2679   int *charbuf = coding->charbuf;
2680   int *charbuf_end = charbuf + coding->charbuf_size - 4;
2681   int consumed_chars = 0, consumed_chars_base;
2682   int char_offset = 0;
2683   int multibytep = coding->src_multibyte;
2684   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2685   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2686   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2687   struct charset *charset;
2688   int c;
2689   /* For handling composition sequence.  */
2690 #define COMPOSING_NO                    0
2691 #define COMPOSING_CHAR                  1
2692 #define COMPOSING_RULE                  2
2693 #define COMPOSING_COMPONENT_CHAR        3
2694 #define COMPOSING_COMPONENT_RULE        4
2695
2696   int composition_state = COMPOSING_NO;
2697   enum composition_method method;
2698   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2699   int component_idx;
2700   int component_len;
2701   Lisp_Object attrs, eol_type, charset_list;
2702
2703   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2704   setup_iso_safe_charsets (attrs);
2705
2706   while (1)
2707     {
2708       int c1, c2;
2709
2710       src_base = src;
2711       consumed_chars_base = consumed_chars;
2712
2713       if (charbuf >= charbuf_end)
2714         break;
2715
2716       ONE_MORE_BYTE (c1);
2717
2718       /* We produce no character or one character.  */
2719       switch (iso_code_class [c1])
2720         {
2721         case ISO_0x20_or_0x7F:
2722           if (composition_state != COMPOSING_NO)
2723             {
2724               if (composition_state == COMPOSING_RULE
2725                   || composition_state == COMPOSING_COMPONENT_RULE)
2726                 {
2727                   DECODE_COMPOSITION_RULE (c1);
2728                   components[component_idx++] = c1;
2729                   composition_state--;
2730                   continue;
2731                 }
2732               else if (method == COMPOSITION_WITH_RULE)
2733                 composition_state = COMPOSING_RULE;
2734               else if (method == COMPOSITION_WITH_RULE_ALTCHARS
2735                        && composition_state == COMPOSING_COMPONENT_CHAR)
2736                 composition_state = COMPOSING_COMPONENT_CHAR;
2737             }
2738           if (charset_id_0 < 0
2739               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2740             {
2741               /* This is SPACE or DEL.  */
2742               charset = CHARSET_FROM_ID (charset_ascii);
2743               break;
2744             }
2745           /* This is a graphic character, we fall down ...  */
2746
2747         case ISO_graphic_plane_0:
2748           if (composition_state == COMPOSING_RULE)
2749             {
2750               DECODE_COMPOSITION_RULE (c1);
2751               components[component_idx++] = c1;
2752               composition_state = COMPOSING_CHAR;
2753             }
2754           charset = CHARSET_FROM_ID (charset_id_0);
2755           break;
2756
2757         case ISO_0xA0_or_0xFF:
2758           if (charset_id_1 < 0
2759               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2760               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2761             goto invalid_code;
2762           /* This is a graphic character, we fall down ... */
2763
2764         case ISO_graphic_plane_1:
2765           if (charset_id_1 < 0)
2766             goto invalid_code;
2767           charset = CHARSET_FROM_ID (charset_id_1);
2768           break;
2769
2770         case ISO_carriage_return:
2771           if (c1 == '\r')
2772             {
2773               if (EQ (eol_type, Qdos))
2774                 {
2775                   if (src == src_end)
2776                     goto no_more_source;
2777                   if (*src == '\n')
2778                     ONE_MORE_BYTE (c1);
2779                 }
2780               else if (EQ (eol_type, Qmac))
2781                 c1 = '\n';
2782             }
2783           /* fall through */
2784
2785         case ISO_control_0:
2786           MAYBE_FINISH_COMPOSITION ();
2787           charset = CHARSET_FROM_ID (charset_ascii);
2788           break;
2789
2790         case ISO_control_1:
2791           MAYBE_FINISH_COMPOSITION ();
2792           goto invalid_code;
2793
2794         case ISO_shift_out:
2795           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2796               || CODING_ISO_DESIGNATION (coding, 1) < 0)
2797             goto invalid_code;
2798           CODING_ISO_INVOCATION (coding, 0) = 1;
2799           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2800           continue;
2801
2802         case ISO_shift_in:
2803           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2804             goto invalid_code;
2805           CODING_ISO_INVOCATION (coding, 0) = 0;
2806           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2807           continue;
2808
2809         case ISO_single_shift_2_7:
2810         case ISO_single_shift_2:
2811           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2812             goto invalid_code;
2813           /* SS2 is handled as an escape sequence of ESC 'N' */
2814           c1 = 'N';
2815           goto label_escape_sequence;
2816
2817         case ISO_single_shift_3:
2818           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2819             goto invalid_code;
2820           /* SS2 is handled as an escape sequence of ESC 'O' */
2821           c1 = 'O';
2822           goto label_escape_sequence;
2823
2824         case ISO_control_sequence_introducer:
2825           /* CSI is handled as an escape sequence of ESC '[' ...  */
2826           c1 = '[';
2827           goto label_escape_sequence;
2828
2829         case ISO_escape:
2830           ONE_MORE_BYTE (c1);
2831         label_escape_sequence:
2832           /* Escape sequences handled here are invocation,
2833              designation, direction specification, and character
2834              composition specification.  */
2835           switch (c1)
2836             {
2837             case '&':           /* revision of following character set */
2838               ONE_MORE_BYTE (c1);
2839               if (!(c1 >= '@' && c1 <= '~'))
2840                 goto invalid_code;
2841               ONE_MORE_BYTE (c1);
2842               if (c1 != ISO_CODE_ESC)
2843                 goto invalid_code;
2844               ONE_MORE_BYTE (c1);
2845               goto label_escape_sequence;
2846
2847             case '$':           /* designation of 2-byte character set */
2848               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2849                 goto invalid_code;
2850               ONE_MORE_BYTE (c1);
2851               if (c1 >= '@' && c1 <= 'B')
2852                 {       /* designation of JISX0208.1978, GB2312.1980,
2853                            or JISX0208.1980 */
2854                   DECODE_DESIGNATION (0, 2, 0, c1);
2855                 }
2856               else if (c1 >= 0x28 && c1 <= 0x2B)
2857                 {       /* designation of DIMENSION2_CHARS94 character set */
2858                   ONE_MORE_BYTE (c2);
2859                   DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2860                 }
2861               else if (c1 >= 0x2C && c1 <= 0x2F)
2862                 {       /* designation of DIMENSION2_CHARS96 character set */
2863                   ONE_MORE_BYTE (c2);
2864                   DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2865                 }
2866               else
2867                 goto invalid_code;
2868               /* We must update these variables now.  */
2869               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2870               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2871               continue;
2872
2873             case 'n':           /* invocation of locking-shift-2 */
2874               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2875                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
2876                 goto invalid_code;
2877               CODING_ISO_INVOCATION (coding, 0) = 2;
2878               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2879               continue;
2880
2881             case 'o':           /* invocation of locking-shift-3 */
2882               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2883                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
2884                 goto invalid_code;
2885               CODING_ISO_INVOCATION (coding, 0) = 3;
2886               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2887               continue;
2888
2889             case 'N':           /* invocation of single-shift-2 */
2890               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2891                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
2892                 goto invalid_code;
2893               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
2894               ONE_MORE_BYTE (c1);
2895               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2896                 goto invalid_code;
2897               break;
2898
2899             case 'O':           /* invocation of single-shift-3 */
2900               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2901                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
2902                 goto invalid_code;
2903               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
2904               ONE_MORE_BYTE (c1);
2905               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2906                 goto invalid_code;
2907               break;
2908
2909             case '0': case '2': case '3': case '4': /* start composition */
2910               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
2911                 goto invalid_code;
2912               DECODE_COMPOSITION_START (c1);
2913               continue;
2914
2915             case '1':           /* end composition */
2916               if (composition_state == COMPOSING_NO)
2917                 goto invalid_code;
2918               DECODE_COMPOSITION_END ();
2919               continue;
2920
2921             case '[':           /* specification of direction */
2922               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
2923                 goto invalid_code;
2924               /* For the moment, nested direction is not supported.
2925                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2926                  left-to-right, and nozero means right-to-left.  */
2927               ONE_MORE_BYTE (c1);
2928               switch (c1)
2929                 {
2930                 case ']':       /* end of the current direction */
2931                   coding->mode &= ~CODING_MODE_DIRECTION;
2932
2933                 case '0':       /* end of the current direction */
2934                 case '1':       /* start of left-to-right direction */
2935                   ONE_MORE_BYTE (c1);
2936                   if (c1 == ']')
2937                     coding->mode &= ~CODING_MODE_DIRECTION;
2938                   else
2939                     goto invalid_code;
2940                   break;
2941
2942                 case '2':       /* start of right-to-left direction */
2943                   ONE_MORE_BYTE (c1);
2944                   if (c1 == ']')
2945                     coding->mode |= CODING_MODE_DIRECTION;
2946                   else
2947                     goto invalid_code;
2948                   break;
2949
2950                 default:
2951                   goto invalid_code;
2952                 }
2953               continue;
2954
2955             default:
2956               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2957                 goto invalid_code;
2958               if (c1 >= 0x28 && c1 <= 0x2B)
2959                 {       /* designation of DIMENSION1_CHARS94 character set */
2960                   ONE_MORE_BYTE (c2);
2961                   DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
2962                 }
2963               else if (c1 >= 0x2C && c1 <= 0x2F)
2964                 {       /* designation of DIMENSION1_CHARS96 character set */
2965                   ONE_MORE_BYTE (c2);
2966                   DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
2967                 }
2968               else
2969                 goto invalid_code;
2970               /* We must update these variables now.  */
2971               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2972               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2973               continue;
2974             }
2975         }
2976
2977       /* Now we know CHARSET and 1st position code C1 of a character.
2978          Produce a decoded character while getting 2nd position code
2979          C2 if necessary.  */
2980       c1 &= 0x7F;
2981       if (CHARSET_DIMENSION (charset) > 1)
2982         {
2983           ONE_MORE_BYTE (c2);
2984           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
2985             /* C2 is not in a valid range.  */
2986             goto invalid_code;
2987           c1 = (c1 << 8) | (c2 & 0x7F);
2988           if (CHARSET_DIMENSION (charset) > 2)
2989             {
2990               ONE_MORE_BYTE (c2);
2991               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
2992                 /* C2 is not in a valid range.  */
2993                 goto invalid_code;
2994               c1 = (c1 << 8) | (c2 & 0x7F);
2995             }
2996         }
2997
2998       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
2999       if (c < 0)
3000         {
3001           MAYBE_FINISH_COMPOSITION ();
3002           for (; src_base < src; src_base++, char_offset++)
3003             {
3004               if (ASCII_BYTE_P (*src_base))
3005                 *charbuf++ = *src_base;
3006               else
3007                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3008             }
3009         }
3010       else if (composition_state == COMPOSING_NO)
3011         {
3012           *charbuf++ = c;
3013           char_offset++;
3014         }
3015       else
3016         components[component_idx++] = c;
3017       continue;
3018
3019     invalid_code:
3020       MAYBE_FINISH_COMPOSITION ();
3021       src = src_base;
3022       consumed_chars = consumed_chars_base;
3023       ONE_MORE_BYTE (c);
3024       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3025       coding->errors++;
3026     }
3027
3028  no_more_source:
3029   coding->consumed_char += consumed_chars_base;
3030   coding->consumed = src_base - coding->source;
3031   coding->charbuf_used = charbuf - coding->charbuf;
3032 }
3033
3034
3035 /* ISO2022 encoding stuff.  */
3036
3037 /*
3038    It is not enough to say just "ISO2022" on encoding, we have to
3039    specify more details.  In Emacs, each coding system of ISO2022
3040    variant has the following specifications:
3041         1. Initial designation to G0 thru G3.
3042         2. Allows short-form designation?
3043         3. ASCII should be designated to G0 before control characters?
3044         4. ASCII should be designated to G0 at end of line?
3045         5. 7-bit environment or 8-bit environment?
3046         6. Use locking-shift?
3047         7. Use Single-shift?
3048    And the following two are only for Japanese:
3049         8. Use ASCII in place of JIS0201-1976-Roman?
3050         9. Use JISX0208-1983 in place of JISX0208-1978?
3051    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3052    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3053    details.
3054 */
3055
3056 /* Produce codes (escape sequence) for designating CHARSET to graphic
3057    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3058    '@', 'A', or 'B' and the coding system CODING allows, produce
3059    designation sequence of short-form.  */
3060
3061 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3062   do {                                                                  \
3063     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3064     char *intermediate_char_94 = "()*+";                                \
3065     char *intermediate_char_96 = ",-./";                                \
3066     int revision = -1;                                                  \
3067     int c;                                                              \
3068                                                                         \
3069     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3070       revision = XINT (CHARSET_ISO_REVISION (charset));                 \
3071                                                                         \
3072     if (revision >= 0)                                                  \
3073       {                                                                 \
3074         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3075         EMIT_ONE_BYTE ('@' + revision);                                 \
3076       }                                                                 \
3077     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3078     if (CHARSET_DIMENSION (charset) == 1)                               \
3079       {                                                                 \
3080         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3081           c = intermediate_char_94[reg];                                \
3082         else                                                            \
3083           c = intermediate_char_96[reg];                                \
3084         EMIT_ONE_ASCII_BYTE (c);                                        \
3085       }                                                                 \
3086     else                                                                \
3087       {                                                                 \
3088         EMIT_ONE_ASCII_BYTE ('$');                                      \
3089         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3090           {                                                             \
3091             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3092                 || reg != 0                                             \
3093                 || final_char < '@' || final_char > 'B')                \
3094               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3095           }                                                             \
3096         else                                                            \
3097           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3098       }                                                                 \
3099     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3100                                                                         \
3101     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3102   } while (0)
3103
3104
3105 /* The following two macros produce codes (control character or escape
3106    sequence) for ISO2022 single-shift functions (single-shift-2 and
3107    single-shift-3).  */
3108
3109 #define ENCODE_SINGLE_SHIFT_2                                           \
3110   do {                                                                  \
3111     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3112       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3113     else                                                                \
3114       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3115     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3116   } while (0)
3117
3118
3119 #define ENCODE_SINGLE_SHIFT_3                                           \
3120   do {                                                                  \
3121     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3122       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
3123     else                                                                \
3124       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
3125     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3126   } while (0)
3127
3128
3129 /* The following four macros produce codes (control character or
3130    escape sequence) for ISO2022 locking-shift functions (shift-in,
3131    shift-out, locking-shift-2, and locking-shift-3).  */
3132
3133 #define ENCODE_SHIFT_IN                                 \
3134   do {                                                  \
3135     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
3136     CODING_ISO_INVOCATION (coding, 0) = 0;              \
3137   } while (0)
3138
3139
3140 #define ENCODE_SHIFT_OUT                                \
3141   do {                                                  \
3142     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
3143     CODING_ISO_INVOCATION (coding, 0) = 1;              \
3144   } while (0)
3145
3146
3147 #define ENCODE_LOCKING_SHIFT_2                          \
3148   do {                                                  \
3149     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3150     CODING_ISO_INVOCATION (coding, 0) = 2;              \
3151   } while (0)
3152
3153
3154 #define ENCODE_LOCKING_SHIFT_3                          \
3155   do {                                                  \
3156     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3157     CODING_ISO_INVOCATION (coding, 0) = 3;              \
3158   } while (0)
3159
3160
3161 /* Produce codes for a DIMENSION1 character whose character set is
3162    CHARSET and whose position-code is C1.  Designation and invocation
3163    sequences are also produced in advance if necessary.  */
3164
3165 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
3166   do {                                                                  \
3167     int id = CHARSET_ID (charset);                                      \
3168     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3169       {                                                                 \
3170         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3171           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
3172         else                                                            \
3173           EMIT_ONE_BYTE (c1 | 0x80);                                    \
3174         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3175         break;                                                          \
3176       }                                                                 \
3177     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3178       {                                                                 \
3179         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
3180         break;                                                          \
3181       }                                                                 \
3182     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3183       {                                                                 \
3184         EMIT_ONE_BYTE (c1 | 0x80);                                      \
3185         break;                                                          \
3186       }                                                                 \
3187     else                                                                \
3188       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3189          must invoke it, or, at first, designate it to some graphic     \
3190          register.  Then repeat the loop to actually produce the        \
3191          character.  */                                                 \
3192       dst = encode_invocation_designation (charset, coding, dst,        \
3193                                            &produced_chars);            \
3194   } while (1)
3195
3196
3197 /* Produce codes for a DIMENSION2 character whose character set is
3198    CHARSET and whose position-codes are C1 and C2.  Designation and
3199    invocation codes are also produced in advance if necessary.  */
3200
3201 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
3202   do {                                                                  \
3203     int id = CHARSET_ID (charset);                                      \
3204     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3205       {                                                                 \
3206         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3207           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
3208         else                                                            \
3209           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
3210         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3211         break;                                                          \
3212       }                                                                 \
3213     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3214       {                                                                 \
3215         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
3216         break;                                                          \
3217       }                                                                 \
3218     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3219       {                                                                 \
3220         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
3221         break;                                                          \
3222       }                                                                 \
3223     else                                                                \
3224       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3225          must invoke it, or, at first, designate it to some graphic     \
3226          register.  Then repeat the loop to actually produce the        \
3227          character.  */                                                 \
3228       dst = encode_invocation_designation (charset, coding, dst,        \
3229                                            &produced_chars);            \
3230   } while (1)
3231
3232
3233 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
3234   do {                                                                     \
3235     int code = ENCODE_CHAR ((charset),(c));                                \
3236                                                                            \
3237     if (CHARSET_DIMENSION (charset) == 1)                                  \
3238       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
3239     else                                                                   \
3240       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3241   } while (0)
3242
3243
3244 /* Produce designation and invocation codes at a place pointed by DST
3245    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3246    Return new DST.  */
3247
3248 unsigned char *
3249 encode_invocation_designation (charset, coding, dst, p_nchars)
3250      struct charset *charset;
3251      struct coding_system *coding;
3252      unsigned char *dst;
3253      int *p_nchars;
3254 {
3255   int multibytep = coding->dst_multibyte;
3256   int produced_chars = *p_nchars;
3257   int reg;                      /* graphic register number */
3258   int id = CHARSET_ID (charset);
3259
3260   /* At first, check designations.  */
3261   for (reg = 0; reg < 4; reg++)
3262     if (id == CODING_ISO_DESIGNATION (coding, reg))
3263       break;
3264
3265   if (reg >= 4)
3266     {
3267       /* CHARSET is not yet designated to any graphic registers.  */
3268       /* At first check the requested designation.  */
3269       reg = CODING_ISO_REQUEST (coding, id);
3270       if (reg < 0)
3271         /* Since CHARSET requests no special designation, designate it
3272            to graphic register 0.  */
3273         reg = 0;
3274
3275       ENCODE_DESIGNATION (charset, reg, coding);
3276     }
3277
3278   if (CODING_ISO_INVOCATION (coding, 0) != reg
3279       && CODING_ISO_INVOCATION (coding, 1) != reg)
3280     {
3281       /* Since the graphic register REG is not invoked to any graphic
3282          planes, invoke it to graphic plane 0.  */
3283       switch (reg)
3284         {
3285         case 0:                 /* graphic register 0 */
3286           ENCODE_SHIFT_IN;
3287           break;
3288
3289         case 1:                 /* graphic register 1 */
3290           ENCODE_SHIFT_OUT;
3291           break;
3292
3293         case 2:                 /* graphic register 2 */
3294           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3295             ENCODE_SINGLE_SHIFT_2;
3296           else
3297             ENCODE_LOCKING_SHIFT_2;
3298           break;
3299
3300         case 3:                 /* graphic register 3 */
3301           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3302             ENCODE_SINGLE_SHIFT_3;
3303           else
3304             ENCODE_LOCKING_SHIFT_3;
3305           break;
3306         }
3307     }
3308
3309   *p_nchars = produced_chars;
3310   return dst;
3311 }
3312
3313 /* The following three macros produce codes for indicating direction
3314    of text.  */
3315 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
3316   do {                                                                  \
3317     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
3318       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
3319     else                                                                \
3320       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
3321   } while (0)
3322
3323
3324 #define ENCODE_DIRECTION_R2L()                  \
3325   do {                                          \
3326     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3327     EMIT_TWO_ASCII_BYTES ('2', ']');            \
3328   } while (0)
3329
3330
3331 #define ENCODE_DIRECTION_L2R()                  \
3332   do {                                          \
3333     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3334     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3335   } while (0)
3336
3337
3338 /* Produce codes for designation and invocation to reset the graphic
3339    planes and registers to initial state.  */
3340 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3341   do {                                                                  \
3342     int reg;                                                            \
3343     struct charset *charset;                                            \
3344                                                                         \
3345     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3346       ENCODE_SHIFT_IN;                                                  \
3347     for (reg = 0; reg < 4; reg++)                                       \
3348       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3349           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3350               != CODING_ISO_INITIAL (coding, reg)))                     \
3351         {                                                               \
3352           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3353           ENCODE_DESIGNATION (charset, reg, coding);                    \
3354         }                                                               \
3355   } while (0)
3356
3357
3358 /* Produce designation sequences of charsets in the line started from
3359    SRC to a place pointed by DST, and return updated DST.
3360
3361    If the current block ends before any end-of-line, we may fail to
3362    find all the necessary designations.  */
3363
3364 static unsigned char *
3365 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3366      struct coding_system *coding;
3367      int *charbuf, *charbuf_end;
3368      unsigned char *dst;
3369 {
3370   struct charset *charset;
3371   /* Table of charsets to be designated to each graphic register.  */
3372   int r[4];
3373   int c, found = 0, reg;
3374   int produced_chars = 0;
3375   int multibytep = coding->dst_multibyte;
3376   Lisp_Object attrs;
3377   Lisp_Object charset_list;
3378
3379   attrs = CODING_ID_ATTRS (coding->id);
3380   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3381   if (EQ (charset_list, Qiso_2022))
3382     charset_list = Viso_2022_charset_list;
3383
3384   for (reg = 0; reg < 4; reg++)
3385     r[reg] = -1;
3386
3387   while (found < 4)
3388     {
3389       int id;
3390
3391       c = *charbuf++;
3392       if (c == '\n')
3393         break;
3394       charset = char_charset (c, charset_list, NULL);
3395       id = CHARSET_ID (charset);
3396       reg = CODING_ISO_REQUEST (coding, id);
3397       if (reg >= 0 && r[reg] < 0)
3398         {
3399           found++;
3400           r[reg] = id;
3401         }
3402     }
3403
3404   if (found)
3405     {
3406       for (reg = 0; reg < 4; reg++)
3407         if (r[reg] >= 0
3408             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3409           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3410     }
3411
3412   return dst;
3413 }
3414
3415 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3416
3417 static int
3418 encode_coding_iso_2022 (coding)
3419      struct coding_system *coding;
3420 {
3421   int multibytep = coding->dst_multibyte;
3422   int *charbuf = coding->charbuf;
3423   int *charbuf_end = charbuf + coding->charbuf_used;
3424   unsigned char *dst = coding->destination + coding->produced;
3425   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3426   int safe_room = 16;
3427   int bol_designation
3428     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3429        && CODING_ISO_BOL (coding));
3430   int produced_chars = 0;
3431   Lisp_Object attrs, eol_type, charset_list;
3432   int ascii_compatible;
3433   int c;
3434
3435   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3436
3437   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3438
3439   while (charbuf < charbuf_end)
3440     {
3441       ASSURE_DESTINATION (safe_room);
3442
3443       if (bol_designation)
3444         {
3445           unsigned char *dst_prev = dst;
3446
3447           /* We have to produce designation sequences if any now.  */
3448           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3449           bol_designation = 0;
3450           /* We are sure that designation sequences are all ASCII bytes.  */
3451           produced_chars += dst - dst_prev;
3452         }
3453
3454       c = *charbuf++;
3455
3456       /* Now encode the character C.  */
3457       if (c < 0x20 || c == 0x7F)
3458         {
3459           if (c == '\n'
3460               || (c == '\r' && EQ (eol_type, Qmac)))
3461             {
3462               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3463                 ENCODE_RESET_PLANE_AND_REGISTER ();
3464               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3465                 {
3466                   int i;
3467
3468                   for (i = 0; i < 4; i++)
3469                     CODING_ISO_DESIGNATION (coding, i)
3470                       = CODING_ISO_INITIAL (coding, i);
3471                 }
3472               bol_designation
3473                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3474             }
3475           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3476             ENCODE_RESET_PLANE_AND_REGISTER ();
3477           EMIT_ONE_ASCII_BYTE (c);
3478         }
3479       else if (ASCII_CHAR_P (c))
3480         {
3481           if (ascii_compatible)
3482             EMIT_ONE_ASCII_BYTE (c);
3483           else
3484             ENCODE_ISO_CHARACTER (CHARSET_FROM_ID (charset_ascii), c);
3485         }
3486       else
3487         {
3488           struct charset *charset = char_charset (c, charset_list, NULL);
3489
3490           if (!charset)
3491             {
3492               c = coding->default_char;
3493               charset = char_charset (c, charset_list, NULL);
3494             }
3495           ENCODE_ISO_CHARACTER (charset, c);
3496         }
3497     }
3498
3499   if (coding->mode & CODING_MODE_LAST_BLOCK
3500       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3501     {
3502       ASSURE_DESTINATION (safe_room);
3503       ENCODE_RESET_PLANE_AND_REGISTER ();
3504     }
3505   coding->result = CODING_RESULT_SUCCESS;
3506   CODING_ISO_BOL (coding) = bol_designation;
3507   coding->produced_char += produced_chars;
3508   coding->produced = dst - coding->destination;
3509   return 0;
3510 }
3511
3512 \f
3513 /*** 8,9. SJIS and BIG5 handlers ***/
3514
3515 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3516    quite widely.  So, for the moment, Emacs supports them in the bare
3517    C code.  But, in the future, they may be supported only by CCL.  */
3518
3519 /* SJIS is a coding system encoding three character sets: ASCII, right
3520    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3521    as is.  A character of charset katakana-jisx0201 is encoded by
3522    "position-code + 0x80".  A character of charset japanese-jisx0208
3523    is encoded in 2-byte but two position-codes are divided and shifted
3524    so that it fit in the range below.
3525
3526    --- CODE RANGE of SJIS ---
3527    (character set)      (range)
3528    ASCII                0x00 .. 0x7F
3529    KATAKANA-JISX0201    0xA0 .. 0xDF
3530    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3531             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3532    -------------------------------
3533
3534 */
3535
3536 /* BIG5 is a coding system encoding two character sets: ASCII and
3537    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3538    character set and is encoded in two-byte.
3539
3540    --- CODE RANGE of BIG5 ---
3541    (character set)      (range)
3542    ASCII                0x00 .. 0x7F
3543    Big5 (1st byte)      0xA1 .. 0xFE
3544         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3545    --------------------------
3546
3547   */
3548
3549 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3550    Check if a text is encoded in SJIS.  If it is, return
3551    CATEGORY_MASK_SJIS, else return 0.  */
3552
3553 static int
3554 detect_coding_sjis (coding, mask)
3555      struct coding_system *coding;
3556      int *mask;
3557 {
3558   unsigned char *src = coding->source, *src_base = src;
3559   unsigned char *src_end = coding->source + coding->src_bytes;
3560   int multibytep = coding->src_multibyte;
3561   int consumed_chars = 0;
3562   int found = 0;
3563   int c;
3564
3565   /* A coding system of this category is always ASCII compatible.  */
3566   src += coding->head_ascii;
3567
3568   while (1)
3569     {
3570       ONE_MORE_BYTE (c);
3571       if (c < 0x80)
3572         continue;
3573       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3574         {
3575           ONE_MORE_BYTE (c);
3576           if (c < 0x40 || c == 0x7F || c > 0xFC)
3577             break;
3578           found = 1;
3579         }
3580       else if (c >= 0xA0 && c < 0xE0)
3581         found = 1;
3582       else
3583         break;
3584     }
3585   *mask &= ~CATEGORY_MASK_SJIS;
3586   return 0;
3587
3588  no_more_source:
3589   if (!found)
3590     return 0;
3591   *mask &= CATEGORY_MASK_SJIS;
3592   return 1;
3593 }
3594
3595 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3596    Check if a text is encoded in BIG5.  If it is, return
3597    CATEGORY_MASK_BIG5, else return 0.  */
3598
3599 static int
3600 detect_coding_big5 (coding, mask)
3601      struct coding_system *coding;
3602      int *mask;
3603 {
3604   unsigned char *src = coding->source, *src_base = src;
3605   unsigned char *src_end = coding->source + coding->src_bytes;
3606   int multibytep = coding->src_multibyte;
3607   int consumed_chars = 0;
3608   int found = 0;
3609   int c;
3610
3611   /* A coding system of this category is always ASCII compatible.  */
3612   src += coding->head_ascii;
3613
3614   while (1)
3615     {
3616       ONE_MORE_BYTE (c);
3617       if (c < 0x80)
3618         continue;
3619       if (c >= 0xA1)
3620         {
3621           ONE_MORE_BYTE (c);
3622           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3623             return 0;
3624           found = 1;
3625         }
3626       else
3627         break;
3628     }
3629   *mask &= ~CATEGORY_MASK_BIG5;
3630   return 0;
3631
3632  no_more_source:
3633   if (!found)
3634     return 0;
3635   *mask &= CATEGORY_MASK_BIG5;
3636   return 1;
3637 }
3638
3639 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3640    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3641
3642 static void
3643 decode_coding_sjis (coding)
3644      struct coding_system *coding;
3645 {
3646   unsigned char *src = coding->source + coding->consumed;
3647   unsigned char *src_end = coding->source + coding->src_bytes;
3648   unsigned char *src_base;
3649   int *charbuf = coding->charbuf;
3650   int *charbuf_end = charbuf + coding->charbuf_size;
3651   int consumed_chars = 0, consumed_chars_base;
3652   int multibytep = coding->src_multibyte;
3653   struct charset *charset_roman, *charset_kanji, *charset_kana;
3654   Lisp_Object attrs, eol_type, charset_list, val;
3655
3656   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3657
3658   val = charset_list;
3659   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3660   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3661   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
3662
3663   while (1)
3664     {
3665       int c, c1;
3666
3667       src_base = src;
3668       consumed_chars_base = consumed_chars;
3669
3670       if (charbuf >= charbuf_end)
3671         break;
3672
3673       ONE_MORE_BYTE (c);
3674
3675       if (c == '\r')
3676         {
3677           if (EQ (eol_type, Qdos))
3678             {
3679               if (src == src_end)
3680                 goto no_more_source;
3681               if (*src == '\n')
3682                 ONE_MORE_BYTE (c);
3683             }
3684           else if (EQ (eol_type, Qmac))
3685             c = '\n';
3686         }
3687       else
3688         {
3689           struct charset *charset;
3690
3691           if (c < 0x80)
3692             charset = charset_roman;
3693           else
3694             {
3695               if (c >= 0xF0)
3696                 goto invalid_code;
3697               if (c < 0xA0 || c >= 0xE0)
3698                 {
3699                   /* SJIS -> JISX0208 */
3700                   ONE_MORE_BYTE (c1);
3701                   if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3702                     goto invalid_code;
3703                   c = (c << 8) | c1;
3704                   SJIS_TO_JIS (c);
3705                   charset = charset_kanji;
3706                 }
3707               else
3708                 /* SJIS -> JISX0201-Kana */
3709                 charset = charset_kana;
3710             }
3711           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3712         }
3713       *charbuf++ = c;
3714       continue;
3715
3716     invalid_code:
3717       src = src_base;
3718       consumed_chars = consumed_chars_base;
3719       ONE_MORE_BYTE (c);
3720       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3721       coding->errors++;
3722     }
3723
3724  no_more_source:
3725   coding->consumed_char += consumed_chars_base;
3726   coding->consumed = src_base - coding->source;
3727   coding->charbuf_used = charbuf - coding->charbuf;
3728 }
3729
3730 static void
3731 decode_coding_big5 (coding)
3732      struct coding_system *coding;
3733 {
3734   unsigned char *src = coding->source + coding->consumed;
3735   unsigned char *src_end = coding->source + coding->src_bytes;
3736   unsigned char *src_base;
3737   int *charbuf = coding->charbuf;
3738   int *charbuf_end = charbuf + coding->charbuf_size;
3739   int consumed_chars = 0, consumed_chars_base;
3740   int multibytep = coding->src_multibyte;
3741   struct charset *charset_roman, *charset_big5;
3742   Lisp_Object attrs, eol_type, charset_list, val;
3743
3744   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3745   val = charset_list;
3746   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3747   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3748
3749   while (1)
3750     {
3751       int c, c1;
3752
3753       src_base = src;
3754       consumed_chars_base = consumed_chars;
3755
3756       if (charbuf >= charbuf_end)
3757         break;
3758
3759       ONE_MORE_BYTE (c);
3760
3761       if (c == '\r')
3762         {
3763           if (EQ (eol_type, Qdos))
3764             {
3765               if (src == src_end)
3766                 goto no_more_source;
3767               if (*src == '\n')
3768                 ONE_MORE_BYTE (c);
3769             }
3770           else if (EQ (eol_type, Qmac))
3771             c = '\n';
3772         }
3773       else
3774         {
3775           struct charset *charset;
3776           if (c < 0x80)
3777             charset = charset_roman;
3778           else
3779             {
3780               /* BIG5 -> Big5 */
3781               if (c < 0xA1 || c > 0xFE)
3782                 goto invalid_code;
3783               ONE_MORE_BYTE (c1);
3784               if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
3785                 goto invalid_code;
3786               c = c << 8 | c1;
3787               charset = charset_big5;
3788             }
3789           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3790         }
3791
3792       *charbuf++ = c;
3793       continue;
3794
3795     invalid_code:
3796       src = src_base;
3797       consumed_chars = consumed_chars_base;
3798       ONE_MORE_BYTE (c);
3799       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3800       coding->errors++;
3801     }
3802
3803  no_more_source:
3804   coding->consumed_char += consumed_chars_base;
3805   coding->consumed = src_base - coding->source;
3806   coding->charbuf_used = charbuf - coding->charbuf;
3807 }
3808
3809 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3810    This function can encode charsets `ascii', `katakana-jisx0201',
3811    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3812    are sure that all these charsets are registered as official charset
3813    (i.e. do not have extended leading-codes).  Characters of other
3814    charsets are produced without any encoding.  If SJIS_P is 1, encode
3815    SJIS text, else encode BIG5 text.  */
3816
3817 static int
3818 encode_coding_sjis (coding)
3819      struct coding_system *coding;
3820 {
3821   int multibytep = coding->dst_multibyte;
3822   int *charbuf = coding->charbuf;
3823   int *charbuf_end = charbuf + coding->charbuf_used;
3824   unsigned char *dst = coding->destination + coding->produced;
3825   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3826   int safe_room = 4;
3827   int produced_chars = 0;
3828   Lisp_Object attrs, eol_type, charset_list, val;
3829   int ascii_compatible;
3830   struct charset *charset_roman, *charset_kanji, *charset_kana;
3831   int c;
3832
3833   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3834   val = charset_list;
3835   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3836   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3837   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3838
3839   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3840
3841   while (charbuf < charbuf_end)
3842     {
3843       ASSURE_DESTINATION (safe_room);
3844       c = *charbuf++;
3845       /* Now encode the character C.  */
3846       if (ASCII_CHAR_P (c) && ascii_compatible)
3847         EMIT_ONE_ASCII_BYTE (c);
3848       else
3849         {
3850           unsigned code;
3851           struct charset *charset = char_charset (c, charset_list, &code);
3852
3853           if (!charset)
3854             {
3855               c = coding->default_char;
3856               charset = char_charset (c, charset_list, &code);
3857             }
3858           if (code == CHARSET_INVALID_CODE (charset))
3859             abort ();
3860           if (charset == charset_kanji)
3861             {
3862               int c1, c2;
3863               JIS_TO_SJIS (code);
3864               c1 = code >> 8, c2 = code & 0xFF;
3865               EMIT_TWO_BYTES (c1, c2);
3866             }
3867           else if (charset == charset_kana)
3868             EMIT_ONE_BYTE (code | 0x80);
3869           else
3870             EMIT_ONE_ASCII_BYTE (code & 0x7F);
3871         }
3872     }
3873   coding->result = CODING_RESULT_SUCCESS;
3874   coding->produced_char += produced_chars;
3875   coding->produced = dst - coding->destination;
3876   return 0;
3877 }
3878
3879 static int
3880 encode_coding_big5 (coding)
3881      struct coding_system *coding;
3882 {
3883   int multibytep = coding->dst_multibyte;
3884   int *charbuf = coding->charbuf;
3885   int *charbuf_end = charbuf + coding->charbuf_used;
3886   unsigned char *dst = coding->destination + coding->produced;
3887   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3888   int safe_room = 4;
3889   int produced_chars = 0;
3890   Lisp_Object attrs, eol_type, charset_list, val;
3891   int ascii_compatible;
3892   struct charset *charset_roman, *charset_big5;
3893   int c;
3894
3895   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3896   val = charset_list;
3897   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3898   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3899   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3900
3901   while (charbuf < charbuf_end)
3902     {
3903       ASSURE_DESTINATION (safe_room);
3904       c = *charbuf++;
3905       /* Now encode the character C.  */
3906       if (ASCII_CHAR_P (c) && ascii_compatible)
3907         EMIT_ONE_ASCII_BYTE (c);
3908       else
3909         {
3910           unsigned code;
3911           struct charset *charset = char_charset (c, charset_list, &code);
3912
3913           if (! charset)
3914             {
3915               c = coding->default_char;
3916               charset = char_charset (c, charset_list, &code);
3917             }
3918           if (code == CHARSET_INVALID_CODE (charset))
3919             abort ();
3920           if (charset == charset_big5)
3921             {
3922               int c1, c2;
3923
3924               c1 = code >> 8, c2 = code & 0xFF;
3925               EMIT_TWO_BYTES (c1, c2);
3926             }
3927           else
3928             EMIT_ONE_ASCII_BYTE (code & 0x7F);
3929         }
3930     }
3931   coding->result = CODING_RESULT_SUCCESS;
3932   coding->produced_char += produced_chars;
3933   coding->produced = dst - coding->destination;
3934   return 0;
3935 }
3936
3937 \f
3938 /*** 10. CCL handlers ***/
3939
3940 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3941    Check if a text is encoded in a coding system of which
3942    encoder/decoder are written in CCL program.  If it is, return
3943    CATEGORY_MASK_CCL, else return 0.  */
3944
3945 static int
3946 detect_coding_ccl (coding, mask)
3947      struct coding_system *coding;
3948      int *mask;
3949 {
3950   unsigned char *src = coding->source, *src_base = src;
3951   unsigned char *src_end = coding->source + coding->src_bytes;
3952   int multibytep = coding->src_multibyte;
3953   int consumed_chars = 0;
3954   int found = 0;
3955   unsigned char *valids = CODING_CCL_VALIDS (coding);
3956   int head_ascii = coding->head_ascii;
3957   Lisp_Object attrs;
3958
3959   coding = &coding_categories[coding_category_ccl];
3960   attrs = CODING_ID_ATTRS (coding->id);
3961   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
3962     src += head_ascii;
3963
3964   while (1)
3965     {
3966       int c;
3967       ONE_MORE_BYTE (c);
3968       if (! valids[c])
3969         break;
3970       if (!found && valids[c] > 1)
3971         found = 1;
3972     }
3973   *mask &= ~CATEGORY_MASK_CCL;
3974   return 0;
3975
3976  no_more_source:
3977   if (!found)
3978     return 0;
3979   *mask &= CATEGORY_MASK_CCL;
3980   return 1;
3981 }
3982
3983 static void
3984 decode_coding_ccl (coding)
3985      struct coding_system *coding;
3986 {
3987   unsigned char *src = coding->source + coding->consumed;
3988   unsigned char *src_end = coding->source + coding->src_bytes;
3989   int *charbuf = coding->charbuf;
3990   int *charbuf_end = charbuf + coding->charbuf_size;
3991   int consumed_chars = 0;
3992   int multibytep = coding->src_multibyte;
3993   struct ccl_program ccl;
3994   int source_charbuf[1024];
3995   int source_byteidx[1024];
3996
3997   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
3998
3999   while (src < src_end)
4000     {
4001       unsigned char *p = src;
4002       int *source, *source_end;
4003       int i = 0;
4004
4005       if (multibytep)
4006         while (i < 1024 && p < src_end)
4007           {
4008             source_byteidx[i] = p - src;
4009             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4010           }
4011       else
4012         while (i < 1024 && p < src_end)
4013           source_charbuf[i++] = *p++;
4014
4015       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4016         ccl.last_block = 1;
4017
4018       source = source_charbuf;
4019       source_end = source + i;
4020       while (source < source_end)
4021         {
4022           ccl_driver (&ccl, source, charbuf,
4023                       source_end - source, charbuf_end - charbuf);
4024           source += ccl.consumed;
4025           charbuf += ccl.produced;
4026           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4027             break;
4028         }
4029       if (source < source_end)
4030         src += source_byteidx[source - source_charbuf];
4031       else
4032         src = p;
4033       consumed_chars += source - source_charbuf;
4034
4035       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4036           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4037         break;
4038     }
4039
4040   switch (ccl.status)
4041     {
4042     case CCL_STAT_SUSPEND_BY_SRC:
4043       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4044       break;
4045     case CCL_STAT_SUSPEND_BY_DST:
4046       break;
4047     case CCL_STAT_QUIT:
4048     case CCL_STAT_INVALID_CMD:
4049       coding->result = CODING_RESULT_INTERRUPT;
4050       break;
4051     default:
4052       coding->result = CODING_RESULT_SUCCESS;
4053       break;
4054     }
4055   coding->consumed_char += consumed_chars;
4056   coding->consumed = src - coding->source;
4057   coding->charbuf_used = charbuf - coding->charbuf;
4058 }
4059
4060 static int
4061 encode_coding_ccl (coding)
4062      struct coding_system *coding;
4063 {
4064   struct ccl_program ccl;
4065   int multibytep = coding->dst_multibyte;
4066   int *charbuf = coding->charbuf;
4067   int *charbuf_end = charbuf + coding->charbuf_used;
4068   unsigned char *dst = coding->destination + coding->produced;
4069   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4070   unsigned char *adjusted_dst_end = dst_end - 1;
4071   int destination_charbuf[1024];
4072   int i, produced_chars = 0;
4073
4074   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4075
4076   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4077   ccl.dst_multibyte = coding->dst_multibyte;
4078
4079   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4080     {
4081       int dst_bytes = dst_end - dst;
4082       if (dst_bytes > 1024)
4083         dst_bytes = 1024;
4084
4085       ccl_driver (&ccl, charbuf, destination_charbuf,
4086                   charbuf_end - charbuf, dst_bytes);
4087       charbuf += ccl.consumed;
4088       if (multibytep)
4089         for (i = 0; i < ccl.produced; i++)
4090           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4091       else
4092         {
4093           for (i = 0; i < ccl.produced; i++)
4094             *dst++ = destination_charbuf[i] & 0xFF;
4095           produced_chars += ccl.produced;
4096         }
4097     }
4098
4099   switch (ccl.status)
4100     {
4101     case CCL_STAT_SUSPEND_BY_SRC:
4102       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4103       break;
4104     case CCL_STAT_SUSPEND_BY_DST:
4105       coding->result = CODING_RESULT_INSUFFICIENT_DST;
4106       break;
4107     case CCL_STAT_QUIT:
4108     case CCL_STAT_INVALID_CMD:
4109       coding->result = CODING_RESULT_INTERRUPT;
4110       break;
4111     default:
4112       coding->result = CODING_RESULT_SUCCESS;
4113       break;
4114     }
4115
4116   coding->produced_char += produced_chars;
4117   coding->produced = dst - coding->destination;
4118   return 0;
4119 }
4120
4121
4122 \f
4123 /*** 10, 11. no-conversion handlers ***/
4124
4125 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4126
4127 static void
4128 decode_coding_raw_text (coding)
4129      struct coding_system *coding;
4130 {
4131   coding->chars_at_source = 1;
4132   coding->consumed_char = 0;
4133   coding->consumed = 0;
4134   coding->result = CODING_RESULT_SUCCESS;
4135 }
4136
4137 static int
4138 encode_coding_raw_text (coding)
4139      struct coding_system *coding;
4140 {
4141   int multibytep = coding->dst_multibyte;
4142   int *charbuf = coding->charbuf;
4143   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4144   unsigned char *dst = coding->destination + coding->produced;
4145   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4146   int produced_chars = 0;
4147   int c;
4148
4149   if (multibytep)
4150     {
4151       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4152
4153       if (coding->src_multibyte)
4154         while (charbuf < charbuf_end)
4155           {
4156             ASSURE_DESTINATION (safe_room);
4157             c = *charbuf++;
4158             if (ASCII_CHAR_P (c))
4159               EMIT_ONE_ASCII_BYTE (c);
4160             else if (CHAR_BYTE8_P (c))
4161               {
4162                 c = CHAR_TO_BYTE8 (c);
4163                 EMIT_ONE_BYTE (c);
4164               }
4165             else
4166               {
4167                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4168
4169                 CHAR_STRING_ADVANCE (c, p1);
4170                 while (p0 < p1)
4171                   EMIT_ONE_BYTE (*p0);
4172               }
4173           }
4174       else
4175         while (charbuf < charbuf_end)
4176           {
4177             ASSURE_DESTINATION (safe_room);
4178             c = *charbuf++;
4179             EMIT_ONE_BYTE (c);
4180           }
4181     }
4182   else
4183     {
4184       if (coding->src_multibyte)
4185         {
4186           int safe_room = MAX_MULTIBYTE_LENGTH;
4187
4188           while (charbuf < charbuf_end)
4189             {
4190               ASSURE_DESTINATION (safe_room);
4191               c = *charbuf++;
4192               if (ASCII_CHAR_P (c))
4193                 *dst++ = c;
4194               else if (CHAR_BYTE8_P (c))
4195                 *dst++ = CHAR_TO_BYTE8 (c);
4196               else
4197                 CHAR_STRING_ADVANCE (c, dst);
4198               produced_chars++;
4199             }
4200         }
4201       else
4202         {
4203           ASSURE_DESTINATION (charbuf_end - charbuf);
4204           while (charbuf < charbuf_end && dst < dst_end)
4205             *dst++ = *charbuf++;
4206           produced_chars = dst - (coding->destination + coding->dst_bytes);
4207         }
4208     }
4209   coding->result = CODING_RESULT_SUCCESS;
4210   coding->produced_char += produced_chars;
4211   coding->produced = dst - coding->destination;
4212   return 0;
4213 }
4214
4215 static int
4216 detect_coding_charset (coding, mask)
4217      struct coding_system *coding;
4218      int *mask;
4219 {
4220   unsigned char *src = coding->source, *src_base = src;
4221   unsigned char *src_end = coding->source + coding->src_bytes;
4222   int multibytep = coding->src_multibyte;
4223   int consumed_chars = 0;
4224   Lisp_Object attrs, valids;
4225
4226   coding = &coding_categories[coding_category_charset];
4227   attrs = CODING_ID_ATTRS (coding->id);
4228   valids = AREF (attrs, coding_attr_charset_valids);
4229
4230   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4231     src += coding->head_ascii;
4232
4233   while (1)
4234     {
4235       int c;
4236
4237       ONE_MORE_BYTE (c);
4238       if (NILP (AREF (valids, c)))
4239         break;
4240     }
4241   *mask &= ~CATEGORY_MASK_CHARSET;
4242   return 0;
4243
4244  no_more_source:
4245   *mask &= CATEGORY_MASK_CHARSET;
4246   return 1;
4247 }
4248
4249 static void
4250 decode_coding_charset (coding)
4251      struct coding_system *coding;
4252 {
4253   unsigned char *src = coding->source + coding->consumed;
4254   unsigned char *src_end = coding->source + coding->src_bytes;
4255   unsigned char *src_base;
4256   int *charbuf = coding->charbuf;
4257   int *charbuf_end = charbuf + coding->charbuf_size;
4258   int consumed_chars = 0, consumed_chars_base;
4259   int multibytep = coding->src_multibyte;
4260   Lisp_Object attrs, eol_type, charset_list, valids;
4261
4262   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4263   valids = AREF (attrs, coding_attr_charset_valids);
4264
4265   while (1)
4266     {
4267       int c;
4268
4269       src_base = src;
4270       consumed_chars_base = consumed_chars;
4271
4272       if (charbuf >= charbuf_end)
4273         break;
4274
4275       ONE_MORE_BYTE (c);
4276       if (c == '\r')
4277         {
4278           if (EQ (eol_type, Qdos))
4279             {
4280               if (src < src_end
4281                   && *src == '\n')
4282                 ONE_MORE_BYTE (c);
4283             }
4284           else if (EQ (eol_type, Qmac))
4285             c = '\n';
4286         }
4287       else
4288         {
4289           Lisp_Object val;
4290           struct charset *charset;
4291           int c1;
4292
4293           val = AREF (valids, c);
4294           if (NILP (val))
4295             goto invalid_code;
4296           charset = CHARSET_FROM_ID (XFASTINT (val));
4297           if (CHARSET_DIMENSION (charset) > 1)
4298             {
4299               ONE_MORE_BYTE (c1);
4300               c = (c << 8) | c1;
4301               if (CHARSET_DIMENSION (charset) > 2)
4302                 {
4303                   ONE_MORE_BYTE (c1);
4304                   c = (c << 8) | c1;
4305                   if (CHARSET_DIMENSION (charset) > 3)
4306                     {
4307                       ONE_MORE_BYTE (c1);
4308                       c = (c << 8) | c1;
4309                     }
4310                 }
4311             }
4312           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4313           if (c < 0)
4314             goto invalid_code;
4315         }
4316       *charbuf++ = c;
4317       continue;
4318
4319     invalid_code:
4320       src = src_base;
4321       consumed_chars = consumed_chars_base;
4322       ONE_MORE_BYTE (c);
4323       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4324       coding->errors++;
4325     }
4326
4327  no_more_source:
4328   coding->consumed_char += consumed_chars_base;
4329   coding->consumed = src_base - coding->source;
4330   coding->charbuf_used = charbuf - coding->charbuf;
4331 }
4332
4333 static int
4334 encode_coding_charset (coding)
4335      struct coding_system *coding;
4336 {
4337   int multibytep = coding->dst_multibyte;
4338   int *charbuf = coding->charbuf;
4339   int *charbuf_end = charbuf + coding->charbuf_used;
4340   unsigned char *dst = coding->destination + coding->produced;
4341   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4342   int safe_room = MAX_MULTIBYTE_LENGTH;
4343   int produced_chars = 0;
4344   Lisp_Object attrs, eol_type, charset_list;
4345   int ascii_compatible;
4346   int c;
4347
4348   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4349   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4350
4351   while (charbuf < charbuf_end)
4352     {
4353       struct charset *charset;
4354       unsigned code;
4355
4356       ASSURE_DESTINATION (safe_room);
4357       c = *charbuf++;
4358       if (ascii_compatible && ASCII_CHAR_P (c))
4359         EMIT_ONE_ASCII_BYTE (c);
4360       else
4361         {
4362           charset = char_charset (c, charset_list, &code);
4363           if (charset)
4364             {
4365               if (CHARSET_DIMENSION (charset) == 1)
4366                 EMIT_ONE_BYTE (code);
4367               else if (CHARSET_DIMENSION (charset) == 2)
4368                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4369               else if (CHARSET_DIMENSION (charset) == 3)
4370                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4371               else
4372                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4373                                  (code >> 8) & 0xFF, code & 0xFF);
4374             }
4375           else
4376             EMIT_ONE_BYTE (coding->default_char);
4377         }
4378     }
4379
4380   coding->result = CODING_RESULT_SUCCESS;
4381   coding->produced_char += produced_chars;
4382   coding->produced = dst - coding->destination;
4383   return 0;
4384 }
4385
4386 \f
4387 /*** 7. C library functions ***/
4388
4389 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
4390    has a property `coding-system'.  The value of this property is a
4391    vector of length 5 (called as coding-vector).  Among elements of
4392    this vector, the first (element[0]) and the fifth (element[4])
4393    carry important information for decoding/encoding.  Before
4394    decoding/encoding, this information should be set in fields of a
4395    structure of type `coding_system'.
4396
4397    A value of property `coding-system' can be a symbol of another
4398    subsidiary coding-system.  In that case, Emacs gets coding-vector
4399    from that symbol.
4400
4401    `element[0]' contains information to be set in `coding->type'.  The
4402    value and its meaning is as follows:
4403
4404    0 -- coding_type_emacs_mule
4405    1 -- coding_type_sjis
4406    2 -- coding_type_iso_2022
4407    3 -- coding_type_big5
4408    4 -- coding_type_ccl encoder/decoder written in CCL
4409    nil -- coding_type_no_conversion
4410    t -- coding_type_undecided (automatic conversion on decoding,
4411                                no-conversion on encoding)
4412
4413    `element[4]' contains information to be set in `coding->flags' and
4414    `coding->spec'.  The meaning varies by `coding->type'.
4415
4416    If `coding->type' is `coding_type_iso_2022', element[4] is a vector
4417    of length 32 (of which the first 13 sub-elements are used now).
4418    Meanings of these sub-elements are:
4419
4420    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso_2022'
4421         If the value is an integer of valid charset, the charset is
4422         assumed to be designated to graphic register N initially.
4423
4424         If the value is minus, it is a minus value of charset which
4425         reserves graphic register N, which means that the charset is
4426         not designated initially but should be designated to graphic
4427         register N just before encoding a character in that charset.
4428
4429         If the value is nil, graphic register N is never used on
4430         encoding.
4431
4432    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
4433         Each value takes t or nil.  See the section ISO2022 of
4434         `coding.h' for more information.
4435
4436    If `coding->type' is `coding_type_big5', element[4] is t to denote
4437    BIG5-ETen or nil to denote BIG5-HKU.
4438
4439    If `coding->type' takes the other value, element[4] is ignored.
4440
4441    Emacs Lisp's coding system also carries information about format of
4442    end-of-line in a value of property `eol-type'.  If the value is
4443    integer, 0 means eol_lf, 1 means eol_crlf, and 2 means eol_cr.  If
4444    it is not integer, it should be a vector of subsidiary coding
4445    systems of which property `eol-type' has one of above values.
4446
4447 */
4448
4449 /* Setup coding context CODING from information about CODING_SYSTEM.
4450    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4451    CODING_SYSTEM is invalid, signal an error.  */
4452
4453 void
4454 setup_coding_system (coding_system, coding)
4455      Lisp_Object coding_system;
4456      struct coding_system *coding;
4457 {
4458   Lisp_Object attrs;
4459   Lisp_Object eol_type;
4460   Lisp_Object coding_type;
4461   Lisp_Object val;
4462
4463   if (NILP (coding_system))
4464     coding_system = Qno_conversion;
4465
4466   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4467
4468   attrs = CODING_ID_ATTRS (coding->id);
4469   eol_type = CODING_ID_EOL_TYPE (coding->id);
4470
4471   coding->mode = 0;
4472   coding->head_ascii = -1;
4473   coding->common_flags
4474     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4475
4476   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4477   coding->max_charset_id = XSTRING (val)->size - 1;
4478   coding->safe_charsets = (char *) XSTRING (val)->data;
4479   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4480
4481   coding_type = CODING_ATTR_TYPE (attrs);
4482   if (EQ (coding_type, Qundecided))
4483     {
4484       coding->detector = NULL;
4485       coding->decoder = decode_coding_raw_text;
4486       coding->encoder = encode_coding_raw_text;
4487       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4488     }
4489   else if (EQ (coding_type, Qiso_2022))
4490     {
4491       int i;
4492       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4493
4494       /* Invoke graphic register 0 to plane 0.  */
4495       CODING_ISO_INVOCATION (coding, 0) = 0;
4496       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4497       CODING_ISO_INVOCATION (coding, 1)
4498         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4499       /* Setup the initial status of designation.  */
4500       for (i = 0; i < 4; i++)
4501         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4502       /* Not single shifting initially.  */
4503       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4504       /* Beginning of buffer should also be regarded as bol. */
4505       CODING_ISO_BOL (coding) = 1;
4506       coding->detector = detect_coding_iso_2022;
4507       coding->decoder = decode_coding_iso_2022;
4508       coding->encoder = encode_coding_iso_2022;
4509       if (flags & CODING_ISO_FLAG_SAFE)
4510         coding->mode |= CODING_MODE_SAFE_ENCODING;
4511       coding->common_flags
4512         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4513             | CODING_REQUIRE_FLUSHING_MASK);
4514       if (flags & CODING_ISO_FLAG_COMPOSITION)
4515         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4516       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4517         {
4518           setup_iso_safe_charsets (attrs);
4519           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4520           coding->max_charset_id = XSTRING (val)->size - 1;
4521           coding->safe_charsets = (char *) XSTRING (val)->data;
4522         }
4523       CODING_ISO_FLAGS (coding) = flags;
4524     }
4525   else if (EQ (coding_type, Qcharset))
4526     {
4527       coding->detector = detect_coding_charset;
4528       coding->decoder = decode_coding_charset;
4529       coding->encoder = encode_coding_charset;
4530       coding->common_flags
4531         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4532     }
4533   else if (EQ (coding_type, Qutf_8))
4534     {
4535       coding->detector = detect_coding_utf_8;
4536       coding->decoder = decode_coding_utf_8;
4537       coding->encoder = encode_coding_utf_8;
4538       coding->common_flags
4539         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4540     }
4541   else if (EQ (coding_type, Qutf_16))
4542     {
4543       val = AREF (attrs, coding_attr_utf_16_bom);
4544       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4545                                     : EQ (val, Qt) ? utf_16_with_bom
4546                                     : utf_16_without_bom);
4547       val = AREF (attrs, coding_attr_utf_16_endian);
4548       CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4549                                        : utf_16_little_endian);
4550       CODING_UTF_16_SURROGATE (coding) = 0;
4551       coding->detector = detect_coding_utf_16;
4552       coding->decoder = decode_coding_utf_16;
4553       coding->encoder = encode_coding_utf_16;
4554       coding->common_flags
4555         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4556     }
4557   else if (EQ (coding_type, Qccl))
4558     {
4559       coding->detector = detect_coding_ccl;
4560       coding->decoder = decode_coding_ccl;
4561       coding->encoder = encode_coding_ccl;
4562       coding->common_flags
4563         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4564             | CODING_REQUIRE_FLUSHING_MASK);
4565     }
4566   else if (EQ (coding_type, Qemacs_mule))
4567     {
4568       coding->detector = detect_coding_emacs_mule;
4569       coding->decoder = decode_coding_emacs_mule;
4570       coding->encoder = encode_coding_emacs_mule;
4571       coding->common_flags
4572         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4573       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4574           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4575         {
4576           Lisp_Object tail, safe_charsets;
4577           int max_charset_id = 0;
4578
4579           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4580                tail = XCDR (tail))
4581             if (max_charset_id < XFASTINT (XCAR (tail)))
4582               max_charset_id = XFASTINT (XCAR (tail));
4583           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4584                                         make_number (255));
4585           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4586                tail = XCDR (tail))
4587             XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4588           coding->max_charset_id = max_charset_id;
4589           coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4590         }
4591     }
4592   else if (EQ (coding_type, Qshift_jis))
4593     {
4594       coding->detector = detect_coding_sjis;
4595       coding->decoder = decode_coding_sjis;
4596       coding->encoder = encode_coding_sjis;
4597       coding->common_flags
4598         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4599     }
4600   else if (EQ (coding_type, Qbig5))
4601     {
4602       coding->detector = detect_coding_big5;
4603       coding->decoder = decode_coding_big5;
4604       coding->encoder = encode_coding_big5;
4605       coding->common_flags
4606         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4607     }
4608   else                          /* EQ (coding_type, Qraw_text) */
4609     {
4610       coding->detector = NULL;
4611       coding->decoder = decode_coding_raw_text;
4612       coding->encoder = encode_coding_raw_text;
4613       coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4614     }
4615
4616   return;
4617 }
4618
4619 /* Return raw-text or one of its subsidiaries that has the same
4620    eol_type as CODING-SYSTEM.  */
4621
4622 Lisp_Object
4623 raw_text_coding_system (coding_system)
4624      Lisp_Object coding_system;
4625 {
4626   Lisp_Object spec, attrs;
4627   Lisp_Object eol_type, raw_text_eol_type;
4628
4629   spec = CODING_SYSTEM_SPEC (coding_system);
4630   attrs = AREF (spec, 0);
4631
4632   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4633     return coding_system;
4634
4635   eol_type = AREF (spec, 2);
4636   if (VECTORP (eol_type))
4637     return Qraw_text;
4638   spec = CODING_SYSTEM_SPEC (Qraw_text);
4639   raw_text_eol_type = AREF (spec, 2);
4640   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4641           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4642           : AREF (raw_text_eol_type, 2));
4643 }
4644
4645
4646 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4647    does, return one of the subsidiary that has the same eol-spec as
4648    PARENT.  Otherwise, return CODING_SYSTEM.  */
4649
4650 Lisp_Object
4651 coding_inherit_eol_type (coding_system, parent)
4652      Lisp_Object coding_system, parent;
4653 {
4654   Lisp_Object spec, attrs, eol_type;
4655
4656   spec = CODING_SYSTEM_SPEC (coding_system);
4657   attrs = AREF (spec, 0);
4658   eol_type = AREF (spec, 2);
4659   if (VECTORP (eol_type))
4660     {
4661       Lisp_Object parent_spec;
4662       Lisp_Object parent_eol_type;
4663
4664       parent_spec
4665         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
4666       parent_eol_type = AREF (parent_spec, 2);
4667       if (EQ (parent_eol_type, Qunix))
4668         coding_system = AREF (eol_type, 0);
4669       else if (EQ (parent_eol_type, Qdos))
4670         coding_system = AREF (eol_type, 1);
4671       else if (EQ (parent_eol_type, Qmac))
4672         coding_system = AREF (eol_type, 2);
4673     }
4674   return coding_system;
4675 }
4676
4677 /* Emacs has a mechanism to automatically detect a coding system if it
4678    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
4679    it's impossible to distinguish some coding systems accurately
4680    because they use the same range of codes.  So, at first, coding
4681    systems are categorized into 7, those are:
4682
4683    o coding-category-emacs-mule
4684
4685         The category for a coding system which has the same code range
4686         as Emacs' internal format.  Assigned the coding-system (Lisp
4687         symbol) `emacs-mule' by default.
4688
4689    o coding-category-sjis
4690
4691         The category for a coding system which has the same code range
4692         as SJIS.  Assigned the coding-system (Lisp
4693         symbol) `japanese-shift-jis' by default.
4694
4695    o coding-category-iso-7
4696
4697         The category for a coding system which has the same code range
4698         as ISO2022 of 7-bit environment.  This doesn't use any locking
4699         shift and single shift functions.  This can encode/decode all
4700         charsets.  Assigned the coding-system (Lisp symbol)
4701         `iso-2022-7bit' by default.
4702
4703    o coding-category-iso-7-tight
4704
4705         Same as coding-category-iso-7 except that this can
4706         encode/decode only the specified charsets.
4707
4708    o coding-category-iso-8-1
4709
4710         The category for a coding system which has the same code range
4711         as ISO2022 of 8-bit environment and graphic plane 1 used only
4712         for DIMENSION1 charset.  This doesn't use any locking shift
4713         and single shift functions.  Assigned the coding-system (Lisp
4714         symbol) `iso-latin-1' by default.
4715
4716    o coding-category-iso-8-2
4717
4718         The category for a coding system which has the same code range
4719         as ISO2022 of 8-bit environment and graphic plane 1 used only
4720         for DIMENSION2 charset.  This doesn't use any locking shift
4721         and single shift functions.  Assigned the coding-system (Lisp
4722         symbol) `japanese-iso-8bit' by default.
4723
4724    o coding-category-iso-7-else
4725
4726         The category for a coding system which has the same code range
4727         as ISO2022 of 7-bit environemnt but uses locking shift or
4728         single shift functions.  Assigned the coding-system (Lisp
4729         symbol) `iso-2022-7bit-lock' by default.
4730
4731    o coding-category-iso-8-else
4732
4733         The category for a coding system which has the same code range
4734         as ISO2022 of 8-bit environemnt but uses locking shift or
4735         single shift functions.  Assigned the coding-system (Lisp
4736         symbol) `iso-2022-8bit-ss2' by default.
4737
4738    o coding-category-big5
4739
4740         The category for a coding system which has the same code range
4741         as BIG5.  Assigned the coding-system (Lisp symbol)
4742         `cn-big5' by default.
4743
4744    o coding-category-utf-8
4745
4746         The category for a coding system which has the same code range
4747         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
4748         symbol) `utf-8' by default.
4749
4750    o coding-category-utf-16-be
4751
4752         The category for a coding system in which a text has an
4753         Unicode signature (cf. Unicode Standard) in the order of BIG
4754         endian at the head.  Assigned the coding-system (Lisp symbol)
4755         `utf-16-be' by default.
4756
4757    o coding-category-utf-16-le
4758
4759         The category for a coding system in which a text has an
4760         Unicode signature (cf. Unicode Standard) in the order of
4761         LITTLE endian at the head.  Assigned the coding-system (Lisp
4762         symbol) `utf-16-le' by default.
4763
4764    o coding-category-ccl
4765
4766         The category for a coding system of which encoder/decoder is
4767         written in CCL programs.  The default value is nil, i.e., no
4768         coding system is assigned.
4769
4770    o coding-category-binary
4771
4772         The category for a coding system not categorized in any of the
4773         above.  Assigned the coding-system (Lisp symbol)
4774         `no-conversion' by default.
4775
4776    Each of them is a Lisp symbol and the value is an actual
4777    `coding-system's (this is also a Lisp symbol) assigned by a user.
4778    What Emacs does actually is to detect a category of coding system.
4779    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4780    decide only one possible category, it selects a category of the
4781    highest priority.  Priorities of categories are also specified by a
4782    user in a Lisp variable `coding-category-list'.
4783
4784 */
4785
4786 #define EOL_SEEN_NONE   0
4787 #define EOL_SEEN_LF     1
4788 #define EOL_SEEN_CR     2
4789 #define EOL_SEEN_CRLF   4
4790
4791 /* Detect how end-of-line of a text of length CODING->src_bytes
4792    pointed by CODING->source is encoded.  Return one of
4793    EOL_SEEN_XXX.  */
4794
4795 #define MAX_EOL_CHECK_COUNT 3
4796
4797 static int
4798 detect_eol (coding, source, src_bytes)
4799      struct coding_system *coding;
4800      unsigned char *source;
4801      EMACS_INT src_bytes;
4802 {
4803   Lisp_Object attrs, coding_type;
4804   unsigned char *src = source, *src_end = src + src_bytes;
4805   unsigned char c;
4806   int total  = 0;
4807   int eol_seen = EOL_SEEN_NONE;
4808
4809   attrs = CODING_ID_ATTRS (coding->id);
4810   coding_type = CODING_ATTR_TYPE (attrs);
4811
4812   if (EQ (coding_type, Qccl))
4813     {
4814       int msb, lsb;
4815
4816       msb = coding->spec.utf_16.endian == utf_16_little_endian;
4817       lsb = 1 - msb;
4818
4819       while (src + 1 < src_end)
4820         {
4821           c = src[lsb];
4822           if (src[msb] == 0 && (c == '\n' || c == '\r'))
4823             {
4824               int this_eol;
4825
4826               if (c == '\n')
4827                 this_eol = EOL_SEEN_LF;
4828               else if (src + 3 >= src_end
4829                        || src[msb + 2] != 0
4830                        || src[lsb + 2] != '\n')
4831                 this_eol = EOL_SEEN_CR;
4832               else
4833                 this_eol = EOL_SEEN_CRLF;
4834
4835               if (eol_seen == EOL_SEEN_NONE)
4836                 /* This is the first end-of-line.  */
4837                 eol_seen = this_eol;
4838               else if (eol_seen != this_eol)
4839                 {
4840                   /* The found type is different from what found before.  */
4841                   eol_seen = EOL_SEEN_LF;
4842                   break;
4843                 }
4844               if (++total == MAX_EOL_CHECK_COUNT)
4845                 break;
4846             }
4847           src += 2;
4848         }
4849     }
4850   else
4851     {
4852       while (src < src_end)
4853         {
4854           c = *src++;
4855           if (c == '\n' || c == '\r')
4856             {
4857               int this_eol;
4858
4859               if (c == '\n')
4860                 this_eol = EOL_SEEN_LF;
4861               else if (src >= src_end || *src != '\n')
4862                 this_eol = EOL_SEEN_CR;
4863               else
4864                 this_eol = EOL_SEEN_CRLF, src++;
4865
4866               if (eol_seen == EOL_SEEN_NONE)
4867                 /* This is the first end-of-line.  */
4868                 eol_seen = this_eol;
4869               else if (eol_seen != this_eol)
4870                 {
4871                   /* The found type is different from what found before.  */
4872                   eol_seen = EOL_SEEN_LF;
4873                   break;
4874                 }
4875               if (++total == MAX_EOL_CHECK_COUNT)
4876                 break;
4877             }
4878         }
4879     }
4880   return eol_seen;
4881 }
4882
4883
4884 static void
4885 adjust_coding_eol_type (coding, eol_seen)
4886      struct coding_system *coding;
4887      int eol_seen;
4888 {
4889   Lisp_Object eol_type;
4890
4891   eol_type = CODING_ID_EOL_TYPE (coding->id);
4892   if (eol_seen & EOL_SEEN_LF)
4893     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
4894   else if (eol_type & EOL_SEEN_CRLF)
4895     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
4896   else if (eol_type & EOL_SEEN_CR)
4897     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
4898 }
4899
4900 /* Detect how a text specified in CODING is encoded.  If a coding
4901    system is detected, update fields of CODING by the detected coding
4902    system.  */
4903
4904 void
4905 detect_coding (coding)
4906      struct coding_system *coding;
4907 {
4908   unsigned char *src, *src_end;
4909   Lisp_Object attrs, coding_type;
4910
4911   coding->consumed = coding->consumed_char = 0;
4912   coding->produced = coding->produced_char = 0;
4913   coding_set_source (coding);
4914
4915   src_end = coding->source + coding->src_bytes;
4916
4917   /* If we have not yet decided the text encoding type, detect it
4918      now.  */
4919   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
4920     {
4921       int mask = CATEGORY_MASK_ANY;
4922       int c, i;
4923
4924       for (src = coding->source; src < src_end; src++)
4925         {
4926           c = *src;
4927           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
4928                                         || c == ISO_CODE_SI
4929                                         || c == ISO_CODE_SO)))
4930             break;
4931         }
4932       coding->head_ascii = src - (coding->source + coding->consumed);
4933
4934       if (coding->head_ascii < coding->src_bytes)
4935         {
4936           int detected = 0;
4937
4938           for (i = 0; i < coding_category_raw_text; i++)
4939             {
4940               enum coding_category category = coding_priorities[i];
4941               struct coding_system *this = coding_categories + category;
4942
4943               if (category >= coding_category_raw_text
4944                   || detected & (1 << category))
4945                 continue;
4946
4947               if (this->id < 0)
4948                 {
4949                   /* No coding system of this category is defined.  */
4950                   mask &= ~(1 << category);
4951                 }
4952               else
4953                 {
4954                   detected |= detected_mask[category];
4955                   if ((*(this->detector)) (coding, &mask))
4956                     break;
4957                 }
4958             }
4959           if (! mask)
4960             setup_coding_system (Qraw_text, coding);
4961           else if (mask != CATEGORY_MASK_ANY)
4962             for (i = 0; i < coding_category_raw_text; i++)
4963               {
4964                 enum coding_category category = coding_priorities[i];
4965                 struct coding_system *this = coding_categories + category;
4966
4967                 if (mask & (1 << category))
4968                   {
4969                     setup_coding_system (CODING_ID_NAME (this->id), coding);
4970                     break;
4971                   }
4972               }
4973         }
4974     }
4975
4976   attrs = CODING_ID_ATTRS (coding->id);
4977   coding_type = CODING_ATTR_TYPE (attrs);
4978
4979   /* If we have not yet decided the EOL type, detect it now.  But, the
4980      detection is impossible for a CCL based coding system, in which
4981      case, we detct the EOL type after decoding.  */
4982   if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
4983       && ! EQ (coding_type, Qccl))
4984     {
4985       int eol_seen = detect_eol (coding, coding->source, coding->src_bytes);
4986
4987       if (eol_seen != EOL_SEEN_NONE)
4988         adjust_coding_eol_type (coding, eol_seen);
4989     }
4990 }
4991
4992
4993 static void
4994 decode_eol (coding)
4995      struct coding_system *coding;
4996 {
4997   if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
4998     {
4999       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5000       unsigned char *pend = p + coding->produced;
5001       int eol_seen = EOL_SEEN_NONE;
5002
5003       for (; p < pend; p++)
5004         {
5005           if (*p == '\n')
5006             eol_seen |= EOL_SEEN_LF;
5007           else if (*p == '\r')
5008             {
5009               if (p + 1 < pend && *(p + 1) == '\n')
5010                 {
5011                   eol_seen |= EOL_SEEN_CRLF;
5012                   p++;
5013                 }
5014               else
5015                 eol_seen |= EOL_SEEN_CR;
5016             }
5017         }
5018       if (eol_seen != EOL_SEEN_NONE)
5019         adjust_coding_eol_type (coding, eol_seen);
5020     }
5021
5022   if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5023     {
5024       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5025       unsigned char *pend = p + coding->produced;
5026
5027       for (; p < pend; p++)
5028         if (*p == '\r')
5029           *p = '\n';
5030     }
5031   else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5032     {
5033       unsigned char *p, *pbeg, *pend;
5034       Lisp_Object undo_list;
5035
5036       move_gap_both (coding->dst_pos + coding->produced_char,
5037                      coding->dst_pos_byte + coding->produced);
5038       undo_list = current_buffer->undo_list;
5039       current_buffer->undo_list = Qt;
5040       del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, Qnil);
5041       current_buffer->undo_list = undo_list;
5042       pbeg = GPT_ADDR;
5043       pend = pbeg + coding->produced;
5044
5045       for (p = pend - 1; p >= pbeg; p--)
5046         if (*p == '\r')
5047           {
5048             safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5049             pend--;
5050           }
5051       coding->produced_char -= coding->produced - (pend - pbeg);
5052       coding->produced = pend - pbeg;
5053       insert_from_gap (coding->produced_char, coding->produced);
5054     }
5055 }
5056
5057 static void
5058 translate_chars (coding, table)
5059      struct coding_system *coding;
5060      Lisp_Object table;
5061 {
5062   int *charbuf = coding->charbuf;
5063   int *charbuf_end = charbuf + coding->charbuf_used;
5064   int c;
5065
5066   if (coding->chars_at_source)
5067     return;
5068
5069   while (charbuf < charbuf_end)
5070     {
5071       c = *charbuf;
5072       if (c < 0)
5073         charbuf += c;
5074       else
5075         *charbuf++ = translate_char (table, c);
5076     }
5077 }
5078
5079 static int
5080 produce_chars (coding)
5081      struct coding_system *coding;
5082 {
5083   unsigned char *dst = coding->destination + coding->produced;
5084   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5085   int produced;
5086   int produced_chars = 0;
5087
5088   if (! coding->chars_at_source)
5089     {
5090       /* Characters are in coding->charbuf.  */
5091       int *buf = coding->charbuf;
5092       int *buf_end = buf + coding->charbuf_used;
5093       unsigned char *adjusted_dst_end;
5094
5095       if (BUFFERP (coding->src_object)
5096           && EQ (coding->src_object, coding->dst_object))
5097         dst_end = coding->source + coding->consumed;
5098       adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5099
5100       while (buf < buf_end)
5101         {
5102           int c = *buf++;
5103
5104           if (dst >= adjusted_dst_end)
5105             {
5106               dst = alloc_destination (coding,
5107                                        buf_end - buf + MAX_MULTIBYTE_LENGTH,
5108                                        dst);
5109               dst_end = coding->destination + coding->dst_bytes;
5110               adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5111             }
5112           if (c >= 0)
5113             {
5114               if (coding->dst_multibyte
5115                   || ! CHAR_BYTE8_P (c))
5116                 CHAR_STRING_ADVANCE (c, dst);
5117               else
5118                 *dst++ = CHAR_TO_BYTE8 (c);
5119               produced_chars++;
5120             }
5121           else
5122             /* This is an annotation data.  */
5123             buf -= c + 1;
5124         }
5125     }
5126   else
5127     {
5128       unsigned char *src = coding->source;
5129       unsigned char *src_end = src + coding->src_bytes;
5130       Lisp_Object eol_type;
5131
5132       eol_type = CODING_ID_EOL_TYPE (coding->id);
5133
5134       if (coding->src_multibyte != coding->dst_multibyte)
5135         {
5136           if (coding->src_multibyte)
5137             {
5138               int multibytep = 1;
5139               int consumed_chars;
5140
5141               while (1)
5142                 {
5143                   unsigned char *src_base = src;
5144                   int c;
5145
5146                   ONE_MORE_BYTE (c);
5147                   if (c == '\r')
5148                     {
5149                       if (EQ (eol_type, Qdos))
5150                         {
5151                           if (src < src_end
5152                               && *src == '\n')
5153                             c = *src++;
5154                         }
5155                       else if (EQ (eol_type, Qmac))
5156                         c = '\n';
5157                     }
5158                   if (dst == dst_end)
5159                     {
5160                       coding->consumed = src - coding->source;
5161
5162                     if (EQ (coding->src_object, coding->dst_object))
5163                       dst_end = src;
5164                     if (dst == dst_end)
5165                       {
5166                         dst = alloc_destination (coding, src_end - src + 1,
5167                                                  dst);
5168                         dst_end = coding->destination + coding->dst_bytes;
5169                         coding_set_source (coding);
5170                         src = coding->source + coding->consumed;
5171                         src_end = coding->source + coding->src_bytes;
5172                       }
5173                     }
5174                   *dst++ = c;
5175                   produced_chars++;
5176                 }
5177             no_more_source:
5178               ;
5179             }
5180           else
5181             while (src < src_end)
5182               {
5183                 int multibytep = 1;
5184                 int c = *src++;
5185
5186                 if (c == '\r')
5187                   {
5188                     if (EQ (eol_type, Qdos))
5189                       {
5190                         if (src < src_end
5191                             && *src == '\n')
5192                           c = *src++;
5193                       }
5194                     else if (EQ (eol_type, Qmac))
5195                       c = '\n';
5196                   }
5197                 if (dst >= dst_end - 1)
5198                   {
5199                     coding->consumed = src - coding->source;
5200
5201                     if (EQ (coding->src_object, coding->dst_object))
5202                       dst_end = src;
5203                     if (dst >= dst_end - 1)
5204                       {
5205                         dst = alloc_destination (coding, src_end - src + 2,
5206                                                  dst);
5207                         dst_end = coding->destination + coding->dst_bytes;
5208                         coding_set_source (coding);
5209                         src = coding->source + coding->consumed;
5210                         src_end = coding->source + coding->src_bytes;
5211                       }
5212                   }
5213                 EMIT_ONE_BYTE (c);
5214               }
5215         }
5216       else
5217         {
5218           if (!EQ (coding->src_object, coding->dst_object))
5219             {
5220               int require = coding->src_bytes - coding->dst_bytes;
5221
5222               if (require > 0)
5223                 {
5224                   EMACS_INT offset = src - coding->source;
5225
5226                   dst = alloc_destination (coding, require, dst);
5227                   coding_set_source (coding);
5228                   src = coding->source + offset;
5229                   src_end = coding->source + coding->src_bytes;
5230                 }
5231             }
5232           produced_chars = coding->src_chars;
5233           while (src < src_end)
5234             {
5235               int c = *src++;
5236
5237               if (c == '\r')
5238                 {
5239                   if (EQ (eol_type, Qdos))
5240                     {
5241                       if (src < src_end
5242                           && *src == '\n')
5243                         c = *src++;
5244                       produced_chars--;
5245                     }
5246                   else if (EQ (eol_type, Qmac))
5247                     c = '\n';
5248                 }
5249               *dst++ = c;
5250             }
5251         }
5252       coding->consumed = coding->src_bytes;
5253       coding->consumed_char = coding->src_chars;
5254     }
5255
5256   produced = dst - (coding->destination + coding->produced);
5257   if (BUFFERP (coding->dst_object))
5258     insert_from_gap (produced_chars, produced);
5259   coding->produced += produced;
5260   coding->produced_char += produced_chars;
5261   return produced_chars;
5262 }
5263
5264 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5265         or
5266    [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5267  */
5268
5269 static INLINE void
5270 produce_composition (coding, charbuf)
5271      struct coding_system *coding;
5272      int *charbuf;
5273 {
5274   Lisp_Object buffer;
5275   int len;
5276   EMACS_INT pos;
5277   enum composition_method method;
5278   int cmp_len;
5279   Lisp_Object components;
5280
5281   buffer = coding->dst_object;
5282   len = -charbuf[0];
5283   pos = coding->dst_pos + charbuf[1];
5284   method = (enum composition_method) (charbuf[3]);
5285   cmp_len = charbuf[4];
5286
5287   if (method == COMPOSITION_RELATIVE)
5288     components = Qnil;
5289   else
5290     {
5291       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5292       int i;
5293
5294       len -= 5;
5295       charbuf += 5;
5296       for (i = 0; i < len; i++)
5297         args[i] = make_number (charbuf[i]);
5298       components = (method == COMPOSITION_WITH_ALTCHARS
5299                     ? Fstring (len, args) : Fvector (len, args));
5300     }
5301   compose_text (pos, pos + cmp_len, components, Qnil, Qnil);
5302 }
5303
5304 static int *
5305 save_composition_data (buf, buf_end, prop)
5306      int *buf, *buf_end;
5307      Lisp_Object prop;
5308 {
5309   enum composition_method method = COMPOSITION_METHOD (prop);
5310   int cmp_len = COMPOSITION_LENGTH (prop);
5311
5312   if (buf + 4 + (MAX_COMPOSITION_COMPONENTS * 2 - 1) > buf_end)
5313     return NULL;
5314
5315   buf[1] = CODING_ANNOTATE_COMPOSITION_MASK;
5316   buf[2] = method;
5317   buf[3] = cmp_len;
5318
5319   if (method == COMPOSITION_RELATIVE)
5320     buf[0] = 4;
5321   else
5322     {
5323       Lisp_Object components;
5324       int len, i;
5325
5326       components = COMPOSITION_COMPONENTS (prop);
5327       if (VECTORP (components))
5328         {
5329           len = XVECTOR (components)->size;
5330           for (i = 0; i < len; i++)
5331             buf[4 + i] = XINT (AREF (components, i));
5332         }
5333       else if (STRINGP (components))
5334         {
5335           int i_byte;
5336
5337           len = XSTRING (components)->size;
5338           i = i_byte = 0;
5339           while (i < len)
5340             FETCH_STRING_CHAR_ADVANCE (buf[4 + i], components, i, i_byte);
5341         }
5342       else if (INTEGERP (components))
5343         {
5344           len = 1;
5345           buf[4] = XINT (components);
5346         }
5347       else if (CONSP (components))
5348         {
5349           for (len = 0; CONSP (components);
5350                len++, components = XCDR (components))
5351             buf[4 + len] = XINT (XCAR (components));
5352         }
5353       else
5354         abort ();
5355       buf[0] = 4 + len;
5356     }
5357   return (buf + buf[0]);
5358 }
5359
5360 #define CHARBUF_SIZE 0x4000
5361
5362 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
5363   do {                                                                  \
5364     int size = CHARBUF_SIZE;;                                           \
5365                                                                         \
5366     coding->charbuf = NULL;                                             \
5367     while (size > 1024)                                                 \
5368       {                                                                 \
5369         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
5370         if (coding->charbuf)                                            \
5371           break;                                                        \
5372         size >>= 1;                                                     \
5373       }                                                                 \
5374     if (! coding->charbuf)                                              \
5375       {                                                                 \
5376         coding->result = CODING_RESULT_INSUFFICIENT_MEM;                \
5377         return coding->result;                                          \
5378       }                                                                 \
5379     coding->charbuf_size = size;                                        \
5380   } while (0)
5381
5382
5383 static void
5384 produce_annotation (coding)
5385      struct coding_system *coding;
5386 {
5387   int *charbuf = coding->charbuf;
5388   int *charbuf_end = charbuf + coding->charbuf_used;
5389
5390   while (charbuf < charbuf_end)
5391     {
5392       if (*charbuf >= 0)
5393         charbuf++;
5394       else
5395         {
5396           int len = -*charbuf;
5397           switch (charbuf[2])
5398             {
5399             case CODING_ANNOTATE_COMPOSITION_MASK:
5400               produce_composition (coding, charbuf);
5401               break;
5402             default:
5403               abort ();
5404             }
5405           charbuf += len;
5406         }
5407     }
5408 }
5409
5410 /* Decode the data at CODING->src_object into CODING->dst_object.
5411    CODING->src_object is a buffer, a string, or nil.
5412    CODING->dst_object is a buffer.
5413
5414    If CODING->src_object is a buffer, it must be the current buffer.
5415    In this case, if CODING->src_pos is positive, it is a position of
5416    the source text in the buffer, otherwise, the source text is in the
5417    gap area of the buffer, and CODING->src_pos specifies the offset of
5418    the text from GPT (which must be the same as PT).  If this is the
5419    same buffer as CODING->dst_object, CODING->src_pos must be
5420    negative.
5421
5422    If CODING->src_object is a string, CODING->src_pos in an index to
5423    that string.
5424
5425    If CODING->src_object is nil, CODING->source must already point to
5426    the non-relocatable memory area.  In this case, CODING->src_pos is
5427    an offset from CODING->source.
5428
5429    The decoded data is inserted at the current point of the buffer
5430    CODING->dst_object.
5431 */
5432
5433 static int
5434 decode_coding (coding)
5435      struct coding_system *coding;
5436 {
5437   Lisp_Object attrs;
5438
5439   if (BUFFERP (coding->src_object)
5440       && coding->src_pos > 0
5441       && coding->src_pos < GPT
5442       && coding->src_pos + coding->src_chars > GPT)
5443     move_gap_both (coding->src_pos, coding->src_pos_byte);
5444
5445   if (BUFFERP (coding->dst_object))
5446     {
5447       if (current_buffer != XBUFFER (coding->dst_object))
5448         set_buffer_internal (XBUFFER (coding->dst_object));
5449       if (GPT != PT)
5450         move_gap_both (PT, PT_BYTE);
5451     }
5452
5453   coding->consumed = coding->consumed_char = 0;
5454   coding->produced = coding->produced_char = 0;
5455   coding->chars_at_source = 0;
5456   coding->result = CODING_RESULT_SUCCESS;
5457   coding->errors = 0;
5458
5459   ALLOC_CONVERSION_WORK_AREA (coding);
5460
5461   attrs = CODING_ID_ATTRS (coding->id);
5462
5463   do
5464     {
5465       coding_set_source (coding);
5466       coding->annotated = 0;
5467       (*(coding->decoder)) (coding);
5468       if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5469         translate_chars (CODING_ATTR_DECODE_TBL (attrs), coding);
5470       coding_set_destination (coding);
5471       produce_chars (coding);
5472       if (coding->annotated)
5473         produce_annotation (coding);
5474     }
5475   while (coding->consumed < coding->src_bytes
5476          && ! coding->result);
5477
5478   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5479       && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5480       && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5481     decode_eol (coding);
5482
5483   coding->carryover_bytes = 0;
5484   if (coding->consumed < coding->src_bytes)
5485     {
5486       int nbytes = coding->src_bytes - coding->consumed;
5487       unsigned char *src;
5488
5489       coding_set_source (coding);
5490       coding_set_destination (coding);
5491       src = coding->source + coding->consumed;
5492
5493       if (coding->mode & CODING_MODE_LAST_BLOCK)
5494         {
5495           /* Flush out unprocessed data as binary chars.  We are sure
5496              that the number of data is less than the size of
5497              coding->charbuf.  */
5498           int *charbuf = coding->charbuf;
5499
5500           while (nbytes-- > 0)
5501             {
5502               int c = *src++;
5503               *charbuf++ =  (c & 0x80 ? - c : c);
5504             }
5505           produce_chars (coding);
5506         }
5507       else
5508         {
5509           /* Record unprocessed bytes in coding->carryover.  We are
5510              sure that the number of data is less than the size of
5511              coding->carryover.  */
5512           unsigned char *p = coding->carryover;
5513
5514           coding->carryover_bytes = nbytes;
5515           while (nbytes-- > 0)
5516             *p++ = *src++;
5517         }
5518       coding->consumed = coding->src_bytes;
5519     }
5520
5521   return coding->result;
5522 }
5523
5524 static void
5525 consume_chars (coding)
5526      struct coding_system *coding;
5527 {
5528   int *buf = coding->charbuf;
5529   /* -1 is to compensate for CRLF.  */
5530   int *buf_end = coding->charbuf + coding->charbuf_size - 1;
5531   unsigned char *src = coding->source + coding->consumed;
5532   int pos = coding->src_pos + coding->consumed_char;
5533   int end_pos = coding->src_pos + coding->src_chars;
5534   int multibytep = coding->src_multibyte;
5535   Lisp_Object eol_type;
5536   int c;
5537   int start, end, stop;
5538   Lisp_Object object, prop;
5539
5540   eol_type = CODING_ID_EOL_TYPE (coding->id);
5541   if (VECTORP (eol_type))
5542     eol_type = Qunix;
5543
5544   object = coding->src_object;
5545
5546   /* Note: composition handling is not yet implemented.  */
5547   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
5548
5549   if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK
5550       && find_composition (pos, end_pos, &start, &end, &prop, object)
5551       && end <= end_pos
5552       && (start >= pos
5553           || (find_composition (end, end_pos, &start, &end, &prop, object)
5554               && end <= end_pos)))
5555     stop = start;
5556   else
5557     stop = end_pos;
5558
5559   while (buf < buf_end)
5560     {
5561       if (pos == stop)
5562         {
5563           int *p;
5564
5565           if (pos == end_pos)
5566             break;
5567           p = save_composition_data (buf, buf_end, prop);
5568           if (p == NULL)
5569             break;
5570           buf = p;
5571           if (find_composition (end, end_pos, &start, &end, &prop, object)
5572               && end <= end_pos)
5573             stop = start;
5574           else
5575             stop = end_pos;
5576         }
5577
5578       if (! multibytep)
5579         c = *src++;
5580       else
5581         c = STRING_CHAR_ADVANCE (src);
5582       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
5583         c = '\n';
5584       if (! EQ (eol_type, Qunix))
5585         {
5586           if (c == '\n')
5587             {
5588               if (EQ (eol_type, Qdos))
5589                 *buf++ = '\r';
5590               else
5591                 c = '\r';
5592             }
5593         }
5594       *buf++ = c;
5595       pos++;
5596     }
5597
5598   coding->consumed = src - coding->source;
5599   coding->consumed_char = pos - coding->src_pos;
5600   coding->charbuf_used = buf - coding->charbuf;
5601   coding->chars_at_source = 0;
5602 }
5603
5604
5605 /* Encode the text at CODING->src_object into CODING->dst_object.
5606    CODING->src_object is a buffer or a string.
5607    CODING->dst_object is a buffer or nil.
5608
5609    If CODING->src_object is a buffer, it must be the current buffer.
5610    In this case, if CODING->src_pos is positive, it is a position of
5611    the source text in the buffer, otherwise. the source text is in the
5612    gap area of the buffer, and coding->src_pos specifies the offset of
5613    the text from GPT (which must be the same as PT).  If this is the
5614    same buffer as CODING->dst_object, CODING->src_pos must be
5615    negative and CODING should not have `pre-write-conversion'.
5616
5617    If CODING->src_object is a string, CODING should not have
5618    `pre-write-conversion'.
5619
5620    If CODING->dst_object is a buffer, the encoded data is inserted at
5621    the current point of that buffer.
5622
5623    If CODING->dst_object is nil, the encoded data is placed at the
5624    memory area specified by CODING->destination.  */
5625
5626 static int
5627 encode_coding (coding)
5628      struct coding_system *coding;
5629 {
5630   Lisp_Object attrs;
5631
5632   attrs = CODING_ID_ATTRS (coding->id);
5633
5634   if (BUFFERP (coding->dst_object))
5635     {
5636       set_buffer_internal (XBUFFER (coding->dst_object));
5637       coding->dst_multibyte
5638         = ! NILP (current_buffer->enable_multibyte_characters);
5639     }
5640
5641   coding->consumed = coding->consumed_char = 0;
5642   coding->produced = coding->produced_char = 0;
5643   coding->result = CODING_RESULT_SUCCESS;
5644   coding->errors = 0;
5645
5646   ALLOC_CONVERSION_WORK_AREA (coding);
5647
5648   do {
5649     coding_set_source (coding);
5650     consume_chars (coding);
5651
5652     if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
5653       translate_chars (CODING_ATTR_ENCODE_TBL (attrs), coding);
5654
5655     coding_set_destination (coding);
5656     (*(coding->encoder)) (coding);
5657   } while (coding->consumed_char < coding->src_chars);
5658
5659   if (BUFFERP (coding->dst_object))
5660     insert_from_gap (coding->produced_char, coding->produced);
5661
5662   return (coding->result);
5663 }
5664
5665 /* Work buffer */
5666
5667 /* List of currently used working buffer.  */
5668 Lisp_Object Vcode_conversion_work_buf_list;
5669
5670 /* A working buffer used by the top level conversion.  */
5671 Lisp_Object Vcode_conversion_reused_work_buf;
5672
5673
5674 /* Return a working buffer that can be freely used by the following
5675    code conversion.  MULTIBYTEP specifies the multibyteness of the
5676    buffer.  */
5677
5678 Lisp_Object
5679 make_conversion_work_buffer (multibytep)
5680      int multibytep;
5681 {
5682   struct buffer *current = current_buffer;
5683   Lisp_Object buf;
5684
5685   if (NILP (Vcode_conversion_work_buf_list))
5686     {
5687       if (NILP (Vcode_conversion_reused_work_buf))
5688         Vcode_conversion_reused_work_buf
5689           = Fget_buffer_create (build_string (" *code-conversion-work*"));
5690       Vcode_conversion_work_buf_list
5691         = Fcons (Vcode_conversion_reused_work_buf, Qnil);
5692     }
5693   else
5694     {
5695       int depth = Flength (Vcode_conversion_work_buf_list);
5696       char str[128];
5697
5698       sprintf (str, " *code-conversion-work*<%d>", depth);
5699       Vcode_conversion_work_buf_list
5700         = Fcons (Fget_buffer_create (build_string (str)),
5701                  Vcode_conversion_work_buf_list);
5702     }
5703
5704   buf = XCAR (Vcode_conversion_work_buf_list);
5705   set_buffer_internal (XBUFFER (buf));
5706   current_buffer->undo_list = Qt;
5707   Ferase_buffer ();
5708   Fset_buffer_multibyte (multibytep ? Qt : Qnil);
5709   set_buffer_internal (current);
5710   return buf;
5711 }
5712
5713 static struct coding_system *saved_coding;
5714
5715 Lisp_Object
5716 code_conversion_restore (info)
5717      Lisp_Object info;
5718 {
5719   int depth = Flength (Vcode_conversion_work_buf_list);
5720   Lisp_Object buf;
5721
5722   if (depth > 0)
5723     {
5724       buf = XCAR (Vcode_conversion_work_buf_list);
5725       Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
5726       if (depth > 1 && !NILP (Fbuffer_live_p (buf)))
5727         Fkill_buffer (buf);
5728     }
5729
5730   if (saved_coding->dst_object == Qt
5731       && saved_coding->destination)
5732     xfree (saved_coding->destination);
5733
5734   return save_excursion_restore (info);
5735 }
5736
5737
5738 int
5739 decode_coding_gap (coding, chars, bytes)
5740      struct coding_system *coding;
5741      EMACS_INT chars, bytes;
5742 {
5743   int count = specpdl_ptr - specpdl;
5744
5745   saved_coding = coding;
5746   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5747
5748   coding->src_object = Fcurrent_buffer ();
5749   coding->src_chars = chars;
5750   coding->src_bytes = bytes;
5751   coding->src_pos = -chars;
5752   coding->src_pos_byte = -bytes;
5753   coding->src_multibyte = chars < bytes;
5754   coding->dst_object = coding->src_object;
5755   coding->dst_pos = PT;
5756   coding->dst_pos_byte = PT_BYTE;
5757   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
5758
5759   if (CODING_REQUIRE_DETECTION (coding))
5760     detect_coding (coding);
5761
5762   decode_coding (coding);
5763
5764   unbind_to (count, Qnil);
5765   return coding->result;
5766 }
5767
5768 int
5769 encode_coding_gap (coding, chars, bytes)
5770      struct coding_system *coding;
5771      EMACS_INT chars, bytes;
5772 {
5773   int count = specpdl_ptr - specpdl;
5774   Lisp_Object buffer;
5775
5776   saved_coding = coding;
5777   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5778
5779   buffer = Fcurrent_buffer ();
5780   coding->src_object = buffer;
5781   coding->src_chars = chars;
5782   coding->src_bytes = bytes;
5783   coding->src_pos = -chars;
5784   coding->src_pos_byte = -bytes;
5785   coding->src_multibyte = chars < bytes;
5786   coding->dst_object = coding->src_object;
5787   coding->dst_pos = PT;
5788   coding->dst_pos_byte = PT_BYTE;
5789
5790   encode_coding (coding);
5791
5792   unbind_to (count, Qnil);
5793   return coding->result;
5794 }
5795
5796
5797 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5798    SRC_OBJECT into DST_OBJECT by coding context CODING.
5799
5800    SRC_OBJECT is a buffer, a string, or Qnil.
5801
5802    If it is a buffer, the text is at point of the buffer.  FROM and TO
5803    are positions in the buffer.
5804
5805    If it is a string, the text is at the beginning of the string.
5806    FROM and TO are indices to the string.
5807
5808    If it is nil, the text is at coding->source.  FROM and TO are
5809    indices to coding->source.
5810
5811    DST_OBJECT is a buffer, Qt, or Qnil.
5812
5813    If it is a buffer, the decoded text is inserted at point of the
5814    buffer.  If the buffer is the same as SRC_OBJECT, the source text
5815    is deleted.
5816
5817    If it is Qt, a string is made from the decoded text, and
5818    set in CODING->dst_object.
5819
5820    If it is Qnil, the decoded text is stored at CODING->destination.
5821    The called must allocate CODING->dst_bytes bytes at
5822    CODING->destination by xmalloc.  If the decoded text is longer than
5823    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5824  */
5825
5826 void
5827 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5828                       dst_object)
5829      struct coding_system *coding;
5830      Lisp_Object src_object;
5831      EMACS_INT from, from_byte, to, to_byte;
5832      Lisp_Object dst_object;
5833 {
5834   int count = specpdl_ptr - specpdl;
5835   unsigned char *destination;
5836   EMACS_INT dst_bytes;
5837   EMACS_INT chars = to - from;
5838   EMACS_INT bytes = to_byte - from_byte;
5839   Lisp_Object attrs;
5840
5841   saved_coding = coding;
5842   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5843
5844   if (NILP (dst_object))
5845     {
5846       destination = coding->destination;
5847       dst_bytes = coding->dst_bytes;
5848     }
5849
5850   coding->src_object = src_object;
5851   coding->src_chars = chars;
5852   coding->src_bytes = bytes;
5853   coding->src_multibyte = chars < bytes;
5854
5855   if (STRINGP (src_object))
5856     {
5857       coding->src_pos = from;
5858       coding->src_pos_byte = from_byte;
5859     }
5860   else if (BUFFERP (src_object))
5861     {
5862       set_buffer_internal (XBUFFER (src_object));
5863       if (from != GPT)
5864         move_gap_both (from, from_byte);
5865       if (EQ (src_object, dst_object))
5866         {
5867           TEMP_SET_PT_BOTH (from, from_byte);
5868           del_range_both (from, from_byte, to, to_byte, 1);
5869           coding->src_pos = -chars;
5870           coding->src_pos_byte = -bytes;
5871         }
5872       else
5873         {
5874           coding->src_pos = from;
5875           coding->src_pos_byte = from_byte;
5876         }
5877     }
5878
5879   if (CODING_REQUIRE_DETECTION (coding))
5880     detect_coding (coding);
5881   attrs = CODING_ID_ATTRS (coding->id);
5882
5883   if (! NILP (CODING_ATTR_POST_READ (attrs))
5884       || EQ (dst_object, Qt))
5885     {
5886       coding->dst_object = make_conversion_work_buffer (1);
5887       coding->dst_pos = BEG;
5888       coding->dst_pos_byte = BEG_BYTE;
5889       coding->dst_multibyte = 1;
5890     }
5891   else if (BUFFERP (dst_object))
5892     {
5893       coding->dst_object = dst_object;
5894       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
5895       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
5896       coding->dst_multibyte
5897         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
5898     }
5899   else
5900     {
5901       coding->dst_object = Qnil;
5902       coding->dst_multibyte = 1;
5903     }
5904
5905   decode_coding (coding);
5906
5907   if (BUFFERP (coding->dst_object))
5908     set_buffer_internal (XBUFFER (coding->dst_object));
5909
5910   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5911     {
5912       struct gcpro gcpro1, gcpro2;
5913       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
5914       Lisp_Object val;
5915
5916       GCPRO2 (coding->src_object, coding->dst_object);
5917       val = call1 (CODING_ATTR_POST_READ (attrs),
5918                    make_number (coding->produced_char));
5919       UNGCPRO;
5920       CHECK_NATNUM (val);
5921       coding->produced_char += Z - prev_Z;
5922       coding->produced += Z_BYTE - prev_Z_BYTE;
5923     }
5924
5925   if (EQ (dst_object, Qt))
5926     {
5927       coding->dst_object = Fbuffer_string ();
5928     }
5929   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
5930     {
5931       set_buffer_internal (XBUFFER (coding->dst_object));
5932       if (dst_bytes < coding->produced)
5933         {
5934           destination
5935             = (unsigned char *) xrealloc (destination, coding->produced);
5936           if (! destination)
5937             {
5938               coding->result = CODING_RESULT_INSUFFICIENT_DST;
5939               unbind_to (count, Qnil);
5940               return;
5941             }
5942           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
5943             move_gap_both (BEGV, BEGV_BYTE);
5944           bcopy (BEGV_ADDR, destination, coding->produced);
5945           coding->destination = destination;
5946         }
5947     }
5948
5949   unbind_to (count, Qnil);
5950 }
5951
5952
5953 void
5954 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5955                       dst_object)
5956      struct coding_system *coding;
5957      Lisp_Object src_object;
5958      EMACS_INT from, from_byte, to, to_byte;
5959      Lisp_Object dst_object;
5960 {
5961   int count = specpdl_ptr - specpdl;
5962   EMACS_INT chars = to - from;
5963   EMACS_INT bytes = to_byte - from_byte;
5964   Lisp_Object attrs;
5965
5966   saved_coding = coding;
5967   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5968
5969   coding->src_object = src_object;
5970   coding->src_chars = chars;
5971   coding->src_bytes = bytes;
5972   coding->src_multibyte = chars < bytes;
5973
5974   attrs = CODING_ID_ATTRS (coding->id);
5975
5976   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5977     {
5978       Lisp_Object val;
5979
5980       coding->src_object = make_conversion_work_buffer (coding->src_multibyte);
5981       set_buffer_internal (XBUFFER (coding->src_object));
5982       if (STRINGP (src_object))
5983         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
5984       else if (BUFFERP (src_object))
5985         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
5986       else
5987         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
5988
5989       if (EQ (src_object, dst_object))
5990         {
5991           set_buffer_internal (XBUFFER (src_object));
5992           del_range_both (from, from_byte, to, to_byte, 1);
5993           set_buffer_internal (XBUFFER (coding->src_object));
5994         }
5995
5996       val = call2 (CODING_ATTR_PRE_WRITE (attrs),
5997                    make_number (1), make_number (chars));
5998       CHECK_NATNUM (val);
5999       if (BEG != GPT)
6000         move_gap_both (BEG, BEG_BYTE);
6001       coding->src_chars = Z - BEG;
6002       coding->src_bytes = Z_BYTE - BEG_BYTE;
6003       coding->src_pos = BEG;
6004       coding->src_pos_byte = BEG_BYTE;
6005       coding->src_multibyte = Z < Z_BYTE;
6006     }
6007   else if (STRINGP (src_object))
6008     {
6009       coding->src_pos = from;
6010       coding->src_pos_byte = from_byte;
6011     }
6012   else if (BUFFERP (src_object))
6013     {
6014       set_buffer_internal (XBUFFER (src_object));
6015       if (from != GPT)
6016         move_gap_both (from, from_byte);
6017       if (EQ (src_object, dst_object))
6018         {
6019           del_range_both (from, from_byte, to, to_byte, 1);
6020           coding->src_pos = -chars;
6021           coding->src_pos_byte = -bytes;
6022         }
6023       else
6024         {
6025           coding->src_pos = from;
6026           coding->src_pos_byte = from_byte;
6027         }
6028     }
6029
6030   if (BUFFERP (dst_object))
6031     {
6032       coding->dst_object = dst_object;
6033       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6034       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6035       coding->dst_multibyte
6036         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6037     }
6038   else if (EQ (dst_object, Qt))
6039     {
6040       coding->dst_object = Qnil;
6041       coding->destination = (unsigned char *) xmalloc (coding->src_chars);
6042       coding->dst_bytes = coding->src_chars;
6043       coding->dst_multibyte = 0;
6044     }
6045   else
6046     {
6047       coding->dst_object = Qnil;
6048       coding->dst_multibyte = 0;
6049     }
6050
6051   encode_coding (coding);
6052
6053   if (EQ (dst_object, Qt))
6054     {
6055       if (BUFFERP (coding->dst_object))
6056         coding->dst_object = Fbuffer_string ();
6057       else
6058         {
6059           coding->dst_object
6060             = make_unibyte_string ((char *) coding->destination,
6061                                    coding->produced);
6062           xfree (coding->destination);
6063         }
6064     }
6065
6066   unbind_to (count, Qnil);
6067 }
6068
6069
6070 Lisp_Object
6071 preferred_coding_system ()
6072 {
6073   int id = coding_categories[coding_priorities[0]].id;
6074
6075   return CODING_ID_NAME (id);
6076 }
6077
6078 \f
6079 #ifdef emacs
6080 /*** 8. Emacs Lisp library functions ***/
6081
6082 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6083        doc: /* Return t if OBJECT is nil or a coding-system.
6084 See the documentation of `define-coding-system' for information
6085 about coding-system objects.  */)
6086      (obj)
6087      Lisp_Object obj;
6088 {
6089   return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6090 }
6091
6092 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6093        Sread_non_nil_coding_system, 1, 1, 0,
6094        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6095      (prompt)
6096      Lisp_Object prompt;
6097 {
6098   Lisp_Object val;
6099   do
6100     {
6101       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6102                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6103     }
6104   while (XSTRING (val)->size == 0);
6105   return (Fintern (val, Qnil));
6106 }
6107
6108 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6109        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6110 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6111      (prompt, default_coding_system)
6112      Lisp_Object prompt, default_coding_system;
6113 {
6114   Lisp_Object val;
6115   if (SYMBOLP (default_coding_system))
6116     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6117   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6118                           Qt, Qnil, Qcoding_system_history,
6119                           default_coding_system, Qnil);
6120   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6121 }
6122
6123 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6124        1, 1, 0,
6125        doc: /* Check validity of CODING-SYSTEM.
6126 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6127 It is valid if it is a symbol with a non-nil `coding-system' property.
6128 The value of property should be a vector of length 5.  */)
6129   (coding_system)
6130      Lisp_Object coding_system;
6131 {
6132   CHECK_SYMBOL (coding_system);
6133   if (!NILP (Fcoding_system_p (coding_system)))
6134     return coding_system;
6135   while (1)
6136     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6137 }
6138
6139 \f
6140 Lisp_Object
6141 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6142      unsigned char *src;
6143      int src_bytes, highest;
6144      int multibytep;
6145      Lisp_Object coding_system;
6146 {
6147   unsigned char *src_end = src + src_bytes;
6148   int mask = CATEGORY_MASK_ANY;
6149   int detected = 0;
6150   int c, i;
6151   Lisp_Object attrs, eol_type;
6152   Lisp_Object val;
6153   struct coding_system coding;
6154
6155   if (NILP (coding_system))
6156     coding_system = Qundecided;
6157   setup_coding_system (coding_system, &coding);
6158   attrs = CODING_ID_ATTRS (coding.id);
6159   eol_type = CODING_ID_EOL_TYPE (coding.id);
6160
6161   coding.source = src;
6162   coding.src_bytes = src_bytes;
6163   coding.src_multibyte = multibytep;
6164   coding.consumed = 0;
6165
6166   if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided)
6167     {
6168       mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6169     }
6170   else
6171     {
6172       coding_system = Qnil;
6173       for (; src < src_end; src++)
6174         {
6175           c = *src;
6176           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
6177                                         || c == ISO_CODE_SI
6178                                         || c == ISO_CODE_SO)))
6179             break;
6180         }
6181       coding.head_ascii = src - coding.source;
6182
6183       if (src < src_end)
6184         for (i = 0; i < coding_category_raw_text; i++)
6185           {
6186             enum coding_category category = coding_priorities[i];
6187             struct coding_system *this = coding_categories + category;
6188
6189             if (category >= coding_category_raw_text
6190                 || detected & (1 << category))
6191               continue;
6192
6193             if (this->id < 0)
6194               {
6195                 /* No coding system of this category is defined.  */
6196                 mask &= ~(1 << category);
6197               }
6198             else
6199               {
6200                 detected |= detected_mask[category];
6201                 if ((*(coding_categories[category].detector)) (&coding, &mask)
6202                     && highest)
6203                   {
6204                     mask &= detected_mask[category];
6205                     break;
6206                   }
6207               }
6208           }
6209     }
6210
6211   if (!mask)
6212     val = Fcons (make_number (coding_category_raw_text), Qnil);
6213   else if (mask == CATEGORY_MASK_ANY)
6214     val = Fcons (make_number (coding_category_undecided), Qnil);
6215   else if (highest)
6216     {
6217       for (i = 0; i < coding_category_raw_text; i++)
6218         if (mask & (1 << coding_priorities[i]))
6219           {
6220             val = Fcons (make_number (coding_priorities[i]), Qnil);
6221             break;
6222           }
6223     }
6224   else
6225     {
6226       val = Qnil;
6227       for (i = coding_category_raw_text - 1; i >= 0; i--)
6228         if (mask & (1 << coding_priorities[i]))
6229           val = Fcons (make_number (coding_priorities[i]), val);
6230     }
6231
6232   {
6233     int one_byte_eol = -1, two_byte_eol = -1;
6234     Lisp_Object tail;
6235
6236     for (tail = val; CONSP (tail); tail = XCDR (tail))
6237       {
6238         struct coding_system *this
6239           = (NILP (coding_system) ? coding_categories + XINT (XCAR (tail))
6240              : &coding);
6241         int this_eol;
6242
6243         attrs = CODING_ID_ATTRS (this->id);
6244         eol_type = CODING_ID_EOL_TYPE (this->id);
6245         XSETCAR (tail, CODING_ID_NAME (this->id));
6246         if (VECTORP (eol_type))
6247           {
6248             if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16))
6249               {
6250                 if (two_byte_eol < 0)
6251                   two_byte_eol = detect_eol (this, coding.source, src_bytes);
6252                 this_eol = two_byte_eol;
6253               }
6254             else
6255               {
6256                 if (one_byte_eol < 0)
6257                   one_byte_eol =detect_eol (this, coding.source, src_bytes);
6258                 this_eol = one_byte_eol;
6259               }
6260             if (this_eol == EOL_SEEN_LF)
6261               XSETCAR (tail, AREF (eol_type, 0));
6262             else if (this_eol == EOL_SEEN_CRLF)
6263               XSETCAR (tail, AREF (eol_type, 1));
6264             else if (this_eol == EOL_SEEN_CR)
6265               XSETCAR (tail, AREF (eol_type, 2));
6266           }
6267       }
6268   }
6269
6270   return (highest ? XCAR (val) : val);
6271 }
6272
6273
6274 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6275        2, 3, 0,
6276        doc: /* Detect coding system of the text in the region between START and END.
6277 Return a list of possible coding systems ordered by priority.
6278
6279 If only ASCII characters are found, it returns a list of single element
6280 `undecided' or its subsidiary coding system according to a detected
6281 end-of-line format.
6282
6283 If optional argument HIGHEST is non-nil, return the coding system of
6284 highest priority.  */)
6285      (start, end, highest)
6286      Lisp_Object start, end, highest;
6287 {
6288   int from, to;
6289   int from_byte, to_byte;
6290
6291   CHECK_NUMBER_COERCE_MARKER (start);
6292   CHECK_NUMBER_COERCE_MARKER (end);
6293
6294   validate_region (&start, &end);
6295   from = XINT (start), to = XINT (end);
6296   from_byte = CHAR_TO_BYTE (from);
6297   to_byte = CHAR_TO_BYTE (to);
6298
6299   if (from < GPT && to >= GPT)
6300     move_gap_both (to, to_byte);
6301
6302   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6303                                to_byte - from_byte,
6304                                !NILP (highest),
6305                                !NILP (current_buffer
6306                                       ->enable_multibyte_characters),
6307                                Qnil);
6308 }
6309
6310 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6311        1, 2, 0,
6312        doc: /* Detect coding system of the text in STRING.
6313 Return a list of possible coding systems ordered by priority.
6314
6315 If only ASCII characters are found, it returns a list of single element
6316 `undecided' or its subsidiary coding system according to a detected
6317 end-of-line format.
6318
6319 If optional argument HIGHEST is non-nil, return the coding system of
6320 highest priority.  */)
6321      (string, highest)
6322      Lisp_Object string, highest;
6323 {
6324   CHECK_STRING (string);
6325
6326   return detect_coding_system (XSTRING (string)->data,
6327                                STRING_BYTES (XSTRING (string)),
6328                                !NILP (highest),
6329                                STRING_MULTIBYTE (string),
6330                                Qnil);
6331 }
6332
6333
6334 static INLINE int
6335 char_encodable_p (c, attrs)
6336      int c;
6337      Lisp_Object attrs;
6338 {
6339   Lisp_Object tail;
6340   struct charset *charset;
6341
6342   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6343        CONSP (tail); tail = XCDR (tail))
6344     {
6345       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6346       if (CHAR_CHARSET_P (c, charset))
6347         break;
6348     }
6349   return (! NILP (tail));
6350 }
6351
6352
6353 /* Return a list of coding systems that safely encode the text between
6354    START and END.  If EXCLUDE is non-nil, it is a list of coding
6355    systems not to check.  The returned list doesn't contain any such
6356    coding systems.  In any case, If the text contains only ASCII or is
6357    unibyte, return t.  */
6358
6359 DEFUN ("find-coding-systems-region-internal",
6360        Ffind_coding_systems_region_internal,
6361        Sfind_coding_systems_region_internal, 2, 3, 0,
6362        doc: /* Internal use only.  */)
6363      (start, end, exclude)
6364      Lisp_Object start, end, exclude;
6365 {
6366   Lisp_Object coding_attrs_list, safe_codings;
6367   EMACS_INT start_byte, end_byte;
6368   unsigned char *p, *pbeg, *pend;
6369   int c;
6370   Lisp_Object tail, elt;
6371
6372   if (STRINGP (start))
6373     {
6374       if (!STRING_MULTIBYTE (start)
6375           && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6376         return Qt;
6377       start_byte = 0;
6378       end_byte = STRING_BYTES (XSTRING (start));
6379     }
6380   else
6381     {
6382       CHECK_NUMBER_COERCE_MARKER (start);
6383       CHECK_NUMBER_COERCE_MARKER (end);
6384       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6385         args_out_of_range (start, end);
6386       if (NILP (current_buffer->enable_multibyte_characters))
6387         return Qt;
6388       start_byte = CHAR_TO_BYTE (XINT (start));
6389       end_byte = CHAR_TO_BYTE (XINT (end));
6390       if (XINT (end) - XINT (start) == end_byte - start_byte)
6391         return Qt;
6392
6393       if (start < GPT && end > GPT)
6394         {
6395           if ((GPT - start) < (end - GPT))
6396             move_gap_both (start, start_byte);
6397           else
6398             move_gap_both (end, end_byte);
6399         }
6400     }
6401
6402   coding_attrs_list = Qnil;
6403   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
6404     if (NILP (exclude)
6405         || NILP (Fmemq (XCAR (tail), exclude)))
6406       {
6407         Lisp_Object attrs;
6408
6409         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
6410         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
6411             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6412           coding_attrs_list = Fcons (attrs, coding_attrs_list);
6413       }
6414
6415   if (STRINGP (start))
6416     p = pbeg = XSTRING (start)->data;
6417   else
6418     p = pbeg = BYTE_POS_ADDR (start_byte);
6419   pend = p + (end_byte - start_byte);
6420
6421   while (p < pend && ASCII_BYTE_P (*p)) p++;
6422   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6423
6424   while (p < pend)
6425     {
6426       if (ASCII_BYTE_P (*p))
6427         p++;
6428       else
6429         {
6430           c = STRING_CHAR_ADVANCE (p);
6431
6432           charset_map_loaded = 0;
6433           for (tail = coding_attrs_list; CONSP (tail);)
6434             {
6435               elt = XCAR (tail);
6436               if (NILP (elt))
6437                 tail = XCDR (tail);
6438               else if (char_encodable_p (c, elt))
6439                 tail = XCDR (tail);
6440               else if (CONSP (XCDR (tail)))
6441                 {
6442                   XSETCAR (tail, XCAR (XCDR (tail)));
6443                   XSETCDR (tail, XCDR (XCDR (tail)));
6444                 }
6445               else
6446                 {
6447                   XSETCAR (tail, Qnil);
6448                   tail = XCDR (tail);
6449                 }
6450             }
6451           if (charset_map_loaded)
6452             {
6453               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6454
6455               if (STRINGP (start))
6456                 pbeg = XSTRING (start)->data;
6457               else
6458                 pbeg = BYTE_POS_ADDR (start_byte);
6459               p = pbeg + p_offset;
6460               pend = pbeg + pend_offset;
6461             }
6462         }
6463     }
6464
6465   safe_codings = Qnil;
6466   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
6467     if (! NILP (XCAR (tail)))
6468       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
6469
6470   return safe_codings;
6471 }
6472
6473
6474 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
6475        Scheck_coding_systems_region, 3, 3, 0,
6476        doc: /* Check if the region is encodable by coding systems.
6477
6478 START and END are buffer positions specifying the region.
6479 CODING-SYSTEM-LIST is a list of coding systems to check.
6480
6481 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6482 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6483 whole region, POS0, POS1, ... are buffer positions where non-encodable
6484 characters are found.
6485
6486 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6487 value is nil.
6488
6489 START may be a string.  In that case, check if the string is
6490 encodable, and the value contains indices to the string instead of
6491 buffer positions.  END is ignored.  */)
6492      (start, end, coding_system_list)
6493      Lisp_Object start, end, coding_system_list;
6494 {
6495   Lisp_Object list;
6496   EMACS_INT start_byte, end_byte;
6497   int pos;
6498   unsigned char *p, *pbeg, *pend;
6499   int c;
6500   Lisp_Object tail, elt;
6501
6502   if (STRINGP (start))
6503     {
6504       if (!STRING_MULTIBYTE (start)
6505           && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6506         return Qnil;
6507       start_byte = 0;
6508       end_byte = STRING_BYTES (XSTRING (start));
6509       pos = 0;
6510     }
6511   else
6512     {
6513       CHECK_NUMBER_COERCE_MARKER (start);
6514       CHECK_NUMBER_COERCE_MARKER (end);
6515       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6516         args_out_of_range (start, end);
6517       if (NILP (current_buffer->enable_multibyte_characters))
6518         return Qnil;
6519       start_byte = CHAR_TO_BYTE (XINT (start));
6520       end_byte = CHAR_TO_BYTE (XINT (end));
6521       if (XINT (end) - XINT (start) == end_byte - start_byte)
6522         return Qt;
6523
6524       if (start < GPT && end > GPT)
6525         {
6526           if ((GPT - start) < (end - GPT))
6527             move_gap_both (start, start_byte);
6528           else
6529             move_gap_both (end, end_byte);
6530         }
6531       pos = start;
6532     }
6533
6534   list = Qnil;
6535   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
6536     {
6537       elt = XCAR (tail);
6538       list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
6539                                        Qnil)),
6540                     list);
6541     }
6542
6543   if (STRINGP (start))
6544     p = pbeg = XSTRING (start)->data;
6545   else
6546     p = pbeg = BYTE_POS_ADDR (start_byte);
6547   pend = p + (end_byte - start_byte);
6548
6549   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
6550   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6551
6552   while (p < pend)
6553     {
6554       if (ASCII_BYTE_P (*p))
6555         p++;
6556       else
6557         {
6558           c = STRING_CHAR_ADVANCE (p);
6559
6560           charset_map_loaded = 0;
6561           for (tail = list; CONSP (tail); tail = XCDR (tail))
6562             {
6563               elt = XCDR (XCAR (tail));
6564               if (! char_encodable_p (c, XCAR (elt)))
6565                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
6566             }
6567           if (charset_map_loaded)
6568             {
6569               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6570
6571               if (STRINGP (start))
6572                 pbeg = XSTRING (start)->data;
6573               else
6574                 pbeg = BYTE_POS_ADDR (start_byte);
6575               p = pbeg + p_offset;
6576               pend = pbeg + pend_offset;
6577             }
6578         }
6579       pos++;
6580     }
6581
6582   tail = list;
6583   list = Qnil;
6584   for (; CONSP (tail); tail = XCDR (tail))
6585     {
6586       elt = XCAR (tail);
6587       if (CONSP (XCDR (XCDR (elt))))
6588         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
6589                       list);
6590     }
6591
6592   return list;
6593 }
6594
6595
6596
6597 Lisp_Object
6598 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
6599      Lisp_Object start, end, coding_system, dst_object;
6600      int encodep, norecord;
6601 {
6602   struct coding_system coding;
6603   EMACS_INT from, from_byte, to, to_byte;
6604   Lisp_Object src_object;
6605
6606   CHECK_NUMBER_COERCE_MARKER (start);
6607   CHECK_NUMBER_COERCE_MARKER (end);
6608   if (NILP (coding_system))
6609     coding_system = Qno_conversion;
6610   else
6611     CHECK_CODING_SYSTEM (coding_system);
6612   src_object = Fcurrent_buffer ();
6613   if (NILP (dst_object))
6614     dst_object = src_object;
6615   else if (! EQ (dst_object, Qt))
6616     CHECK_BUFFER (dst_object);
6617
6618   validate_region (&start, &end);
6619   from = XFASTINT (start);
6620   from_byte = CHAR_TO_BYTE (from);
6621   to = XFASTINT (end);
6622   to_byte = CHAR_TO_BYTE (to);
6623
6624   setup_coding_system (coding_system, &coding);
6625   coding.mode |= CODING_MODE_LAST_BLOCK;
6626
6627   if (encodep)
6628     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6629                           dst_object);
6630   else
6631     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6632                           dst_object);
6633   if (! norecord)
6634     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6635
6636   if (coding.result != CODING_RESULT_SUCCESS)
6637     error ("Code conversion error: %d", coding.result);
6638
6639   return (BUFFERP (dst_object)
6640           ? make_number (coding.produced_char)
6641           : coding.dst_object);
6642 }
6643
6644
6645 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6646        3, 4, "r\nzCoding system: ",
6647        doc: /* Decode the current region from the specified coding system.
6648 When called from a program, takes four arguments:
6649         START, END, CODING-SYSTEM, and DESTINATION.
6650 START and END are buffer positions.
6651
6652 Optional 4th arguments DESTINATION specifies where the decoded text goes.
6653 If nil, the region between START and END is replace by the decoded text.
6654 If buffer, the decoded text is inserted in the buffer.
6655 If t, the decoded text is returned.
6656
6657 This function sets `last-coding-system-used' to the precise coding system
6658 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6659 not fully specified.)
6660 It returns the length of the decoded text.  */)
6661      (start, end, coding_system, destination)
6662      Lisp_Object start, end, coding_system, destination;
6663 {
6664   return code_convert_region (start, end, coding_system, destination, 0, 0);
6665 }
6666
6667 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6668        3, 4, "r\nzCoding system: ",
6669        doc: /* Encode the current region by specified coding system.
6670 When called from a program, takes three arguments:
6671 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6672
6673 Optional 4th arguments DESTINATION specifies where the encoded text goes.
6674 If nil, the region between START and END is replace by the encoded text.
6675 If buffer, the encoded text is inserted in the buffer.
6676 If t, the encoded text is returned.
6677
6678 This function sets `last-coding-system-used' to the precise coding system
6679 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6680 not fully specified.)
6681 It returns the length of the encoded text.  */)
6682   (start, end, coding_system, destination)
6683      Lisp_Object start, end, coding_system, destination;
6684 {
6685   return code_convert_region (start, end, coding_system, destination, 1, 0);
6686 }
6687
6688 Lisp_Object
6689 code_convert_string (string, coding_system, dst_object,
6690                      encodep, nocopy, norecord)
6691      Lisp_Object string, coding_system, dst_object;
6692      int encodep, nocopy, norecord;
6693 {
6694   struct coding_system coding;
6695   EMACS_INT chars, bytes;
6696
6697   CHECK_STRING (string);
6698   if (NILP (coding_system))
6699     {
6700       if (! norecord)
6701         Vlast_coding_system_used = Qno_conversion;
6702       if (NILP (dst_object))
6703         return (nocopy ? Fcopy_sequence (string) : string);
6704     }
6705
6706   if (NILP (coding_system))
6707     coding_system = Qno_conversion;
6708   else
6709     CHECK_CODING_SYSTEM (coding_system);
6710   if (NILP (dst_object))
6711     dst_object = Qt;
6712   else if (! EQ (dst_object, Qt))
6713     CHECK_BUFFER (dst_object);
6714
6715   setup_coding_system (coding_system, &coding);
6716   coding.mode |= CODING_MODE_LAST_BLOCK;
6717   chars = XSTRING (string)->size;
6718   bytes = STRING_BYTES (XSTRING (string));
6719   if (encodep)
6720     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6721   else
6722     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6723   if (! norecord)
6724     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6725
6726   if (coding.result != CODING_RESULT_SUCCESS)
6727     error ("Code conversion error: %d", coding.result);
6728
6729   return (BUFFERP (dst_object)
6730           ? make_number (coding.produced_char)
6731           : coding.dst_object);
6732 }
6733
6734
6735 /* Encode or decode STRING according to CODING_SYSTEM.
6736    Do not set Vlast_coding_system_used.
6737
6738    This function is called only from macros DECODE_FILE and
6739    ENCODE_FILE, thus we ignore character composition.  */
6740
6741 Lisp_Object
6742 code_convert_string_norecord (string, coding_system, encodep)
6743      Lisp_Object string, coding_system;
6744      int encodep;
6745 {
6746   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
6747 }
6748
6749
6750 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6751        2, 4, 0,
6752        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6753
6754 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6755 if the decoding operation is trivial.
6756
6757 Optional fourth arg BUFFER non-nil meant that the decoded text is
6758 inserted in BUFFER instead of returned as a astring.  In this case,
6759 the return value is BUFFER.
6760
6761 This function sets `last-coding-system-used' to the precise coding system
6762 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6763 not fully specified.  */)
6764   (string, coding_system, nocopy, buffer)
6765      Lisp_Object string, coding_system, nocopy, buffer;
6766 {
6767   return code_convert_string (string, coding_system, buffer,
6768                               0, ! NILP (nocopy), 0);
6769 }
6770
6771 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6772        2, 4, 0,
6773        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6774
6775 Optional third arg NOCOPY non-nil means it is OK to return STRING
6776 itself if the encoding operation is trivial.
6777
6778 Optional fourth arg BUFFER non-nil meant that the encoded text is
6779 inserted in BUFFER instead of returned as a astring.  In this case,
6780 the return value is BUFFER.
6781
6782 This function sets `last-coding-system-used' to the precise coding system
6783 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6784 not fully specified.)  */)
6785      (string, coding_system, nocopy, buffer)
6786      Lisp_Object string, coding_system, nocopy, buffer;
6787 {
6788   return code_convert_string (string, coding_system, buffer,
6789                               nocopy, ! NILP (nocopy), 1);
6790 }
6791
6792 \f
6793 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6794        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6795 Return the corresponding character.  */)
6796      (code)
6797      Lisp_Object code;
6798 {
6799   Lisp_Object spec, attrs, val;
6800   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
6801   int c;
6802
6803   CHECK_NATNUM (code);
6804   c = XFASTINT (code);
6805   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6806   attrs = AREF (spec, 0);
6807
6808   if (ASCII_BYTE_P (c)
6809       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6810     return code;
6811
6812   val = CODING_ATTR_CHARSET_LIST (attrs);
6813   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6814   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6815   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
6816
6817   if (c <= 0x7F)
6818     charset = charset_roman;
6819   else if (c >= 0xA0 && c < 0xDF)
6820     {
6821       charset = charset_kana;
6822       c -= 0x80;
6823     }
6824   else
6825     {
6826       int s1 = c >> 8, s2 = c & 0x7F;
6827
6828       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
6829           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
6830         error ("Invalid code: %d", code);
6831       SJIS_TO_JIS (c);
6832       charset = charset_kanji;
6833     }
6834   c = DECODE_CHAR (charset, c);
6835   if (c < 0)
6836     error ("Invalid code: %d", code);
6837   return make_number (c);
6838 }
6839
6840
6841 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6842        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6843 Return the corresponding code in SJIS.  */)
6844      (ch)
6845     Lisp_Object ch;
6846 {
6847   Lisp_Object spec, attrs, charset_list;
6848   int c;
6849   struct charset *charset;
6850   unsigned code;
6851
6852   CHECK_CHARACTER (ch);
6853   c = XFASTINT (ch);
6854   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6855   attrs = AREF (spec, 0);
6856
6857   if (ASCII_CHAR_P (c)
6858       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6859     return ch;
6860
6861   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6862   charset = char_charset (c, charset_list, &code);
6863   if (code == CHARSET_INVALID_CODE (charset))
6864     error ("Can't encode by shift_jis encoding: %d", c);
6865   JIS_TO_SJIS (code);
6866
6867   return make_number (code);
6868 }
6869
6870 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6871        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
6872 Return the corresponding character.  */)
6873      (code)
6874      Lisp_Object code;
6875 {
6876   Lisp_Object spec, attrs, val;
6877   struct charset *charset_roman, *charset_big5, *charset;
6878   int c;
6879
6880   CHECK_NATNUM (code);
6881   c = XFASTINT (code);
6882   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
6883   attrs = AREF (spec, 0);
6884
6885   if (ASCII_BYTE_P (c)
6886       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6887     return code;
6888
6889   val = CODING_ATTR_CHARSET_LIST (attrs);
6890   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6891   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
6892
6893   if (c <= 0x7F)
6894     charset = charset_roman;
6895   else
6896     {
6897       int b1 = c >> 8, b2 = c & 0x7F;
6898       if (b1 < 0xA1 || b1 > 0xFE
6899           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
6900         error ("Invalid code: %d", code);
6901       charset = charset_big5;
6902     }
6903   c = DECODE_CHAR (charset, (unsigned )c);
6904   if (c < 0)
6905     error ("Invalid code: %d", code);
6906   return make_number (c);
6907 }
6908
6909 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
6910        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
6911 Return the corresponding character code in Big5.  */)
6912      (ch)
6913      Lisp_Object ch;
6914 {
6915   Lisp_Object spec, attrs, charset_list;
6916   struct charset *charset;
6917   int c;
6918   unsigned code;
6919
6920   CHECK_CHARACTER (ch);
6921   c = XFASTINT (ch);
6922   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
6923   attrs = AREF (spec, 0);
6924   if (ASCII_CHAR_P (c)
6925       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6926     return ch;
6927
6928   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6929   charset = char_charset (c, charset_list, &code);
6930   if (code == CHARSET_INVALID_CODE (charset))
6931     error ("Can't encode by Big5 encoding: %d", c);
6932
6933   return make_number (code);
6934 }
6935
6936 \f
6937 DEFUN ("set-terminal-coding-system-internal",
6938        Fset_terminal_coding_system_internal,
6939        Sset_terminal_coding_system_internal, 1, 1, 0,
6940        doc: /* Internal use only.  */)
6941      (coding_system)
6942      Lisp_Object coding_system;
6943 {
6944   CHECK_SYMBOL (coding_system);
6945   setup_coding_system (Fcheck_coding_system (coding_system),
6946                         &terminal_coding);
6947
6948   /* We had better not send unsafe characters to terminal.  */
6949   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
6950   /* Characer composition should be disabled.  */
6951   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6952   terminal_coding.src_multibyte = 1;
6953   terminal_coding.dst_multibyte = 0;
6954   return Qnil;
6955 }
6956
6957 DEFUN ("set-safe-terminal-coding-system-internal",
6958        Fset_safe_terminal_coding_system_internal,
6959        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
6960        doc: /* Internal use only.  */)
6961      (coding_system)
6962      Lisp_Object coding_system;
6963 {
6964   CHECK_SYMBOL (coding_system);
6965   setup_coding_system (Fcheck_coding_system (coding_system),
6966                        &safe_terminal_coding);
6967   /* Characer composition should be disabled.  */
6968   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6969   safe_terminal_coding.src_multibyte = 1;
6970   safe_terminal_coding.dst_multibyte = 0;
6971   return Qnil;
6972 }
6973
6974 DEFUN ("terminal-coding-system",
6975        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
6976        doc: /* Return coding system specified for terminal output.  */)
6977      ()
6978 {
6979   return CODING_ID_NAME (terminal_coding.id);
6980 }
6981
6982 DEFUN ("set-keyboard-coding-system-internal",
6983        Fset_keyboard_coding_system_internal,
6984        Sset_keyboard_coding_system_internal, 1, 1, 0,
6985        doc: /* Internal use only.  */)
6986      (coding_system)
6987      Lisp_Object coding_system;
6988 {
6989   CHECK_SYMBOL (coding_system);
6990   setup_coding_system (Fcheck_coding_system (coding_system),
6991                        &keyboard_coding);
6992   /* Characer composition should be disabled.  */
6993   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6994   return Qnil;
6995 }
6996
6997 DEFUN ("keyboard-coding-system",
6998        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
6999        doc: /* Return coding system specified for decoding keyboard input.  */)
7000      ()
7001 {
7002   return CODING_ID_NAME (keyboard_coding.id);
7003 }
7004
7005 \f
7006 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7007        Sfind_operation_coding_system,  1, MANY, 0,
7008        doc: /* Choose a coding system for an operation based on the target name.
7009 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7010 DECODING-SYSTEM is the coding system to use for decoding
7011 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7012 for encoding (in case OPERATION does encoding).
7013
7014 The first argument OPERATION specifies an I/O primitive:
7015   For file I/O, `insert-file-contents' or `write-region'.
7016   For process I/O, `call-process', `call-process-region', or `start-process'.
7017   For network I/O, `open-network-stream'.
7018
7019 The remaining arguments should be the same arguments that were passed
7020 to the primitive.  Depending on which primitive, one of those arguments
7021 is selected as the TARGET.  For example, if OPERATION does file I/O,
7022 whichever argument specifies the file name is TARGET.
7023
7024 TARGET has a meaning which depends on OPERATION:
7025   For file I/O, TARGET is a file name.
7026   For process I/O, TARGET is a process name.
7027   For network I/O, TARGET is a service name or a port number
7028
7029 This function looks up what specified for TARGET in,
7030 `file-coding-system-alist', `process-coding-system-alist',
7031 or `network-coding-system-alist' depending on OPERATION.
7032 They may specify a coding system, a cons of coding systems,
7033 or a function symbol to call.
7034 In the last case, we call the function with one argument,
7035 which is a list of all the arguments given to this function.
7036
7037 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7038      (nargs, args)
7039      int nargs;
7040      Lisp_Object *args;
7041 {
7042   Lisp_Object operation, target_idx, target, val;
7043   register Lisp_Object chain;
7044
7045   if (nargs < 2)
7046     error ("Too few arguments");
7047   operation = args[0];
7048   if (!SYMBOLP (operation)
7049       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7050     error ("Invalid first arguement");
7051   if (nargs < 1 + XINT (target_idx))
7052     error ("Too few arguments for operation: %s",
7053            XSYMBOL (operation)->name->data);
7054   target = args[XINT (target_idx) + 1];
7055   if (!(STRINGP (target)
7056         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7057     error ("Invalid %dth argument", XINT (target_idx) + 1);
7058
7059   chain = ((EQ (operation, Qinsert_file_contents)
7060             || EQ (operation, Qwrite_region))
7061            ? Vfile_coding_system_alist
7062            : (EQ (operation, Qopen_network_stream)
7063               ? Vnetwork_coding_system_alist
7064               : Vprocess_coding_system_alist));
7065   if (NILP (chain))
7066     return Qnil;
7067
7068   for (; CONSP (chain); chain = XCDR (chain))
7069     {
7070       Lisp_Object elt;
7071
7072       elt = XCAR (chain);
7073       if (CONSP (elt)
7074           && ((STRINGP (target)
7075                && STRINGP (XCAR (elt))
7076                && fast_string_match (XCAR (elt), target) >= 0)
7077               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7078         {
7079           val = XCDR (elt);
7080           /* Here, if VAL is both a valid coding system and a valid
7081              function symbol, we return VAL as a coding system.  */
7082           if (CONSP (val))
7083             return val;
7084           if (! SYMBOLP (val))
7085             return Qnil;
7086           if (! NILP (Fcoding_system_p (val)))
7087             return Fcons (val, val);
7088           if (! NILP (Ffboundp (val)))
7089             {
7090               val = call1 (val, Flist (nargs, args));
7091               if (CONSP (val))
7092                 return val;
7093               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7094                 return Fcons (val, val);
7095             }
7096           return Qnil;
7097         }
7098     }
7099   return Qnil;
7100 }
7101
7102 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7103        Sset_coding_system_priority, 1, MANY, 0,
7104        doc: /* Put higher priority to coding systems of the arguments.  */)
7105      (nargs, args)
7106      int nargs;
7107      Lisp_Object *args;
7108 {
7109   int i, j;
7110   int changed[coding_category_max];
7111   enum coding_category priorities[coding_category_max];
7112
7113   bzero (changed, sizeof changed);
7114
7115   for (i = j = 0; i < nargs; i++)
7116     {
7117       enum coding_category category;
7118       Lisp_Object spec, attrs;
7119
7120       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7121       attrs = AREF (spec, 0);
7122       category = XINT (CODING_ATTR_CATEGORY (attrs));
7123       if (changed[category])
7124         /* Ignore this coding system because a coding system of the
7125            same category already had a higher priority.  */
7126         continue;
7127       changed[category] = 1;
7128       priorities[j++] = category;
7129       if (coding_categories[category].id >= 0
7130           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7131         setup_coding_system (args[i], &coding_categories[category]);
7132     }
7133
7134   /* Now we have decided top J priorities.  Reflect the order of the
7135      original priorities to the remaining priorities.  */
7136
7137   for (i = j, j = 0; i < coding_category_max; i++, j++)
7138     {
7139       while (j < coding_category_max
7140              && changed[coding_priorities[j]])
7141         j++;
7142       if (j == coding_category_max)
7143         abort ();
7144       priorities[i] = coding_priorities[j];
7145     }
7146
7147   bcopy (priorities, coding_priorities, sizeof priorities);
7148   return Qnil;
7149 }
7150
7151 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7152        Scoding_system_priority_list, 0, 1, 0,
7153        doc: /* Return a list of coding systems ordered by their priorities.  */)
7154      (highestp)
7155      Lisp_Object highestp;
7156 {
7157   int i;
7158   Lisp_Object val;
7159
7160   for (i = 0, val = Qnil; i < coding_category_max; i++)
7161     {
7162       enum coding_category category = coding_priorities[i];
7163       int id = coding_categories[category].id;
7164       Lisp_Object attrs;
7165
7166       if (id < 0)
7167         continue;
7168       attrs = CODING_ID_ATTRS (id);
7169       if (! NILP (highestp))
7170         return CODING_ATTR_BASE_NAME (attrs);
7171       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7172     }
7173   return Fnreverse (val);
7174 }
7175
7176 static Lisp_Object
7177 make_subsidiaries (base)
7178      Lisp_Object base;
7179 {
7180   Lisp_Object subsidiaries;
7181   char *suffixes[] = { "-unix", "-dos", "-mac" };
7182   int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7183   char *buf = (char *) alloca (base_name_len + 6);
7184   int i;
7185
7186   bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7187   subsidiaries = Fmake_vector (make_number (3), Qnil);
7188   for (i = 0; i < 3; i++)
7189     {
7190       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7191       ASET (subsidiaries, i, intern (buf));
7192     }
7193   return subsidiaries;
7194 }
7195
7196
7197 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7198        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7199        doc: /* For internal use only.  */)
7200      (nargs, args)
7201      int nargs;
7202      Lisp_Object *args;
7203 {
7204   Lisp_Object name;
7205   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
7206   Lisp_Object attrs;            /* Vector of attributes.  */
7207   Lisp_Object eol_type;
7208   Lisp_Object aliases;
7209   Lisp_Object coding_type, charset_list, safe_charsets;
7210   enum coding_category category;
7211   Lisp_Object tail, val;
7212   int max_charset_id = 0;
7213   int i;
7214
7215   if (nargs < coding_arg_max)
7216     goto short_args;
7217
7218   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7219
7220   name = args[coding_arg_name];
7221   CHECK_SYMBOL (name);
7222   CODING_ATTR_BASE_NAME (attrs) = name;
7223
7224   val = args[coding_arg_mnemonic];
7225   if (! STRINGP (val))
7226     CHECK_CHARACTER (val);
7227   CODING_ATTR_MNEMONIC (attrs) = val;
7228
7229   coding_type = args[coding_arg_coding_type];
7230   CHECK_SYMBOL (coding_type);
7231   CODING_ATTR_TYPE (attrs) = coding_type;
7232
7233   charset_list = args[coding_arg_charset_list];
7234   if (SYMBOLP (charset_list))
7235     {
7236       if (EQ (charset_list, Qiso_2022))
7237         {
7238           if (! EQ (coding_type, Qiso_2022))
7239             error ("Invalid charset-list");
7240           charset_list = Viso_2022_charset_list;
7241         }
7242       else if (EQ (charset_list, Qemacs_mule))
7243         {
7244           if (! EQ (coding_type, Qemacs_mule))
7245             error ("Invalid charset-list");
7246           charset_list = Vemacs_mule_charset_list;
7247         }
7248       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7249         if (max_charset_id < XFASTINT (XCAR (tail)))
7250           max_charset_id = XFASTINT (XCAR (tail));
7251     }
7252   else
7253     {
7254       charset_list = Fcopy_sequence (charset_list);
7255       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7256         {
7257           struct charset *charset;
7258
7259           val = Fcar (tail);
7260           CHECK_CHARSET_GET_CHARSET (val, charset);
7261           if (EQ (coding_type, Qiso_2022)
7262               ? CHARSET_ISO_FINAL (charset) < 0
7263               : EQ (coding_type, Qemacs_mule)
7264               ? CHARSET_EMACS_MULE_ID (charset) < 0
7265               : 0)
7266             error ("Can't handle charset `%s'",
7267                    XSYMBOL (CHARSET_NAME (charset))->name->data);
7268
7269           XCAR (tail) = make_number (charset->id);
7270           if (max_charset_id < charset->id)
7271             max_charset_id = charset->id;
7272         }
7273     }
7274   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7275
7276   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7277                                 make_number (255));
7278   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7279     XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7280   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7281
7282   val = args[coding_arg_decode_translation_table];
7283   if (! NILP (val))
7284     CHECK_CHAR_TABLE (val);
7285   CODING_ATTR_DECODE_TBL (attrs) = val;
7286
7287   val = args[coding_arg_encode_translation_table];
7288   if (! NILP (val))
7289     CHECK_CHAR_TABLE (val);
7290   CODING_ATTR_ENCODE_TBL (attrs) = val;
7291
7292   val = args[coding_arg_post_read_conversion];
7293   CHECK_SYMBOL (val);
7294   CODING_ATTR_POST_READ (attrs) = val;
7295
7296   val = args[coding_arg_pre_write_conversion];
7297   CHECK_SYMBOL (val);
7298   CODING_ATTR_PRE_WRITE (attrs) = val;
7299
7300   val = args[coding_arg_default_char];
7301   if (NILP (val))
7302     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7303   else
7304     {
7305       CHECK_CHARACTER (val);
7306       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7307     }
7308
7309   val = args[coding_arg_plist];
7310   CHECK_LIST (val);
7311   CODING_ATTR_PLIST (attrs) = val;
7312
7313   if (EQ (coding_type, Qcharset))
7314     {
7315       val = Fmake_vector (make_number (256), Qnil);
7316
7317       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7318         {
7319           struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7320           int idx = (CHARSET_DIMENSION (charset) - 1) * 4;
7321
7322           for (i = charset->code_space[idx];
7323                i <= charset->code_space[idx + 1]; i++)
7324             {
7325               if (NILP (AREF (val, i)))
7326                 ASET (val, i, XCAR (tail));
7327               else
7328                 error ("Charsets conflicts in the first byte");
7329             }
7330         }
7331       ASET (attrs, coding_attr_charset_valids, val);
7332       category = coding_category_charset;
7333     }
7334   else if (EQ (coding_type, Qccl))
7335     {
7336       Lisp_Object valids;
7337
7338       if (nargs < coding_arg_ccl_max)
7339         goto short_args;
7340
7341       val = args[coding_arg_ccl_decoder];
7342       CHECK_CCL_PROGRAM (val);
7343       if (VECTORP (val))
7344         val = Fcopy_sequence (val);
7345       ASET (attrs, coding_attr_ccl_decoder, val);
7346
7347       val = args[coding_arg_ccl_encoder];
7348       CHECK_CCL_PROGRAM (val);
7349       if (VECTORP (val))
7350         val = Fcopy_sequence (val);
7351       ASET (attrs, coding_attr_ccl_encoder, val);
7352
7353       val = args[coding_arg_ccl_valids];
7354       valids = Fmake_string (make_number (256), make_number (0));
7355       for (tail = val; !NILP (tail); tail = Fcdr (tail))
7356         {
7357           val = Fcar (tail);
7358           if (INTEGERP (val))
7359             ASET (valids, XINT (val), 1);
7360           else
7361             {
7362               int from, to;
7363
7364               CHECK_CONS (val);
7365               CHECK_NUMBER (XCAR (val));
7366               CHECK_NUMBER (XCDR (val));
7367               from = XINT (XCAR (val));
7368               to = XINT (XCDR (val));
7369               for (i = from; i <= to; i++)
7370                 ASET (valids, i, 1);
7371             }
7372         }
7373       ASET (attrs, coding_attr_ccl_valids, valids);
7374
7375       category = coding_category_ccl;
7376     }
7377   else if (EQ (coding_type, Qutf_16))
7378     {
7379       Lisp_Object bom, endian;
7380
7381       if (nargs < coding_arg_utf16_max)
7382         goto short_args;
7383
7384       bom = args[coding_arg_utf16_bom];
7385       if (! NILP (bom) && ! EQ (bom, Qt))
7386         {
7387           CHECK_CONS (bom);
7388           CHECK_CODING_SYSTEM (XCAR (bom));
7389           CHECK_CODING_SYSTEM (XCDR (bom));
7390         }
7391       ASET (attrs, coding_attr_utf_16_bom, bom);
7392
7393       endian = args[coding_arg_utf16_endian];
7394       ASET (attrs, coding_attr_utf_16_endian, endian);
7395
7396       category = (CONSP (bom)
7397                   ? coding_category_utf_16_auto
7398                   : NILP (bom)
7399                   ? (NILP (endian)
7400                      ? coding_category_utf_16_be_nosig
7401                      : coding_category_utf_16_le_nosig)
7402                   : (NILP (endian)
7403                      ? coding_category_utf_16_be
7404                      : coding_category_utf_16_le));
7405     }
7406   else if (EQ (coding_type, Qiso_2022))
7407     {
7408       Lisp_Object initial, reg_usage, request, flags;
7409       int i, id;
7410
7411       if (nargs < coding_arg_iso2022_max)
7412         goto short_args;
7413
7414       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
7415       CHECK_VECTOR (initial);
7416       for (i = 0; i < 4; i++)
7417         {
7418           val = Faref (initial, make_number (i));
7419           if (! NILP (val))
7420             {
7421               CHECK_CHARSET_GET_ID (val, id);
7422               ASET (initial, i, make_number (id));
7423             }
7424           else
7425             ASET (initial, i, make_number (-1));
7426         }
7427
7428       reg_usage = args[coding_arg_iso2022_reg_usage];
7429       CHECK_CONS (reg_usage);
7430       CHECK_NATNUM (XCAR (reg_usage));
7431       CHECK_NATNUM (XCDR (reg_usage));
7432
7433       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
7434       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
7435         {
7436           int id;
7437
7438           val = Fcar (tail);
7439           CHECK_CONS (val);
7440           CHECK_CHARSET_GET_ID (XCAR (val), id);
7441           CHECK_NATNUM (XCDR (val));
7442           if (XINT (XCDR (val)) >= 4)
7443             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
7444           XCAR (val) = make_number (id);
7445         }
7446
7447       flags = args[coding_arg_iso2022_flags];
7448       CHECK_NATNUM (flags);
7449       i = XINT (flags);
7450       if (EQ (args[coding_arg_charset_list], Qiso_2022))
7451         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
7452
7453       ASET (attrs, coding_attr_iso_initial, initial);
7454       ASET (attrs, coding_attr_iso_usage, reg_usage);
7455       ASET (attrs, coding_attr_iso_request, request);
7456       ASET (attrs, coding_attr_iso_flags, flags);
7457       setup_iso_safe_charsets (attrs);
7458
7459       if (i & CODING_ISO_FLAG_SEVEN_BITS)
7460         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7461                           | CODING_ISO_FLAG_SINGLE_SHIFT))
7462                     ? coding_category_iso_7_else
7463                     : EQ (args[coding_arg_charset_list], Qiso_2022)
7464                     ? coding_category_iso_7
7465                     : coding_category_iso_7_tight);
7466       else
7467         {
7468           int id = XINT (AREF (initial, 1));
7469
7470           category = (((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7471                              | CODING_ISO_FLAG_SINGLE_SHIFT))
7472                        || EQ (args[coding_arg_charset_list], Qiso_2022)
7473                        || id < 0)
7474                       ? coding_category_iso_8_else
7475                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
7476                       ? coding_category_iso_8_1
7477                       : coding_category_iso_8_2);
7478         }
7479     }
7480   else if (EQ (coding_type, Qemacs_mule))
7481     {
7482       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
7483         ASET (attrs, coding_attr_emacs_mule_full, Qt);
7484
7485       category = coding_category_emacs_mule;
7486     }
7487   else if (EQ (coding_type, Qshift_jis))
7488     {
7489
7490       struct charset *charset;
7491
7492       if (XINT (Flength (charset_list)) != 3)
7493         error ("There should be just three charsets");
7494
7495       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7496       if (CHARSET_DIMENSION (charset) != 1)
7497         error ("Dimension of charset %s is not one",
7498                XSYMBOL (CHARSET_NAME (charset))->name->data);
7499
7500       charset_list = XCDR (charset_list);
7501       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7502       if (CHARSET_DIMENSION (charset) != 1)
7503         error ("Dimension of charset %s is not one",
7504                XSYMBOL (CHARSET_NAME (charset))->name->data);
7505
7506       charset_list = XCDR (charset_list);
7507       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7508       if (CHARSET_DIMENSION (charset) != 2)
7509         error ("Dimension of charset %s is not two",
7510                XSYMBOL (CHARSET_NAME (charset))->name->data);
7511
7512       category = coding_category_sjis;
7513       Vsjis_coding_system = name;
7514     }
7515   else if (EQ (coding_type, Qbig5))
7516     {
7517       struct charset *charset;
7518
7519       if (XINT (Flength (charset_list)) != 2)
7520         error ("There should be just two charsets");
7521
7522       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7523       if (CHARSET_DIMENSION (charset) != 1)
7524         error ("Dimension of charset %s is not one",
7525                XSYMBOL (CHARSET_NAME (charset))->name->data);
7526
7527       charset_list = XCDR (charset_list);
7528       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7529       if (CHARSET_DIMENSION (charset) != 2)
7530         error ("Dimension of charset %s is not two",
7531                XSYMBOL (CHARSET_NAME (charset))->name->data);
7532
7533       category = coding_category_big5;
7534       Vbig5_coding_system = name;
7535     }
7536   else if (EQ (coding_type, Qraw_text))
7537     category = coding_category_raw_text;
7538   else if (EQ (coding_type, Qutf_8))
7539     category = coding_category_utf_8;
7540   else if (EQ (coding_type, Qundecided))
7541     category = coding_category_undecided;
7542   else
7543     error ("Invalid coding system type: %s",
7544            XSYMBOL (coding_type)->name->data);
7545
7546   CODING_ATTR_CATEGORY (attrs) = make_number (category);
7547
7548   eol_type = args[coding_arg_eol_type];
7549   if (! NILP (eol_type)
7550       && ! EQ (eol_type, Qunix)
7551       && ! EQ (eol_type, Qdos)
7552       && ! EQ (eol_type, Qmac))
7553     error ("Invalid eol-type");
7554
7555   aliases = Fcons (name, Qnil);
7556
7557   if (NILP (eol_type))
7558     {
7559       eol_type = make_subsidiaries (name);
7560       for (i = 0; i < 3; i++)
7561         {
7562           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
7563
7564           this_name = AREF (eol_type, i);
7565           this_aliases = Fcons (this_name, Qnil);
7566           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
7567           this_spec = Fmake_vector (make_number (3), attrs);
7568           ASET (this_spec, 1, this_aliases);
7569           ASET (this_spec, 2, this_eol_type);
7570           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
7571           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
7572           Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
7573                                         Vcoding_system_alist);
7574         }
7575     }
7576
7577   spec_vec = Fmake_vector (make_number (3), attrs);
7578   ASET (spec_vec, 1, aliases);
7579   ASET (spec_vec, 2, eol_type);
7580
7581   Fputhash (name, spec_vec, Vcoding_system_hash_table);
7582   Vcoding_system_list = Fcons (name, Vcoding_system_list);
7583   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
7584                                 Vcoding_system_alist);
7585
7586   {
7587     int id = coding_categories[category].id;
7588
7589     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
7590       setup_coding_system (name, &coding_categories[category]);
7591   }
7592
7593   return Qnil;
7594
7595  short_args:
7596   return Fsignal (Qwrong_number_of_arguments,
7597                   Fcons (intern ("define-coding-system-internal"),
7598                          make_number (nargs)));
7599 }
7600
7601 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
7602        Sdefine_coding_system_alias, 2, 2, 0,
7603        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
7604      (alias, coding_system)
7605      Lisp_Object alias, coding_system;
7606 {
7607   Lisp_Object spec, aliases, eol_type;
7608
7609   CHECK_SYMBOL (alias);
7610   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7611   aliases = AREF (spec, 1);
7612   while (!NILP (XCDR (aliases)))
7613     aliases = XCDR (aliases);
7614   XCDR (aliases) = Fcons (alias, Qnil);
7615
7616   eol_type = AREF (spec, 2);
7617   if (VECTORP (eol_type))
7618     {
7619       Lisp_Object subsidiaries;
7620       int i;
7621
7622       subsidiaries = make_subsidiaries (alias);
7623       for (i = 0; i < 3; i++)
7624         Fdefine_coding_system_alias (AREF (subsidiaries, i),
7625                                      AREF (eol_type, i));
7626
7627       ASET (spec, 2, subsidiaries);
7628     }
7629
7630   Fputhash (alias, spec, Vcoding_system_hash_table);
7631   Vcoding_system_alist = Fcons (Fcons (alias, Qnil), Vcoding_system_alist);
7632
7633   return Qnil;
7634 }
7635
7636 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
7637        1, 1, 0,
7638        doc: /* Return the base of CODING-SYSTEM.
7639 Any alias or subsidiary coding systems are not base coding system.  */)
7640   (coding_system)
7641      Lisp_Object coding_system;
7642 {
7643   Lisp_Object spec, attrs;
7644
7645   if (NILP (coding_system))
7646     return (Qno_conversion);
7647   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7648   attrs = AREF (spec, 0);
7649   return CODING_ATTR_BASE_NAME (attrs);
7650 }
7651
7652 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
7653        1, 1, 0,
7654        doc: "Return the property list of CODING-SYSTEM.")
7655      (coding_system)
7656      Lisp_Object coding_system;
7657 {
7658   Lisp_Object spec, attrs;
7659
7660   if (NILP (coding_system))
7661     coding_system = Qno_conversion;
7662   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7663   attrs = AREF (spec, 0);
7664   return CODING_ATTR_PLIST (attrs);
7665 }
7666
7667
7668 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
7669        1, 1, 0,
7670        doc: /* Return the list of aliases of CODING-SYSTEM.
7671 A base coding system is what made by `define-coding-system'.
7672 Any alias nor subsidiary coding systems are not base coding system.  */)
7673      (coding_system)
7674      Lisp_Object coding_system;
7675 {
7676   Lisp_Object spec;
7677
7678   if (NILP (coding_system))
7679     coding_system = Qno_conversion;
7680   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7681   return AREF (spec, 2);
7682 }
7683
7684 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
7685        Scoding_system_eol_type, 1, 1, 0,
7686        doc: /* Return eol-type of CODING-SYSTEM.
7687 An eol-type is integer 0, 1, 2, or a vector of coding systems.
7688
7689 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7690 and CR respectively.
7691
7692 A vector value indicates that a format of end-of-line should be
7693 detected automatically.  Nth element of the vector is the subsidiary
7694 coding system whose eol-type is N.  */)
7695      (coding_system)
7696      Lisp_Object coding_system;
7697 {
7698   Lisp_Object spec, eol_type;
7699   int n;
7700
7701   if (NILP (coding_system))
7702     coding_system = Qno_conversion;
7703   if (! CODING_SYSTEM_P (coding_system))
7704     return Qnil;
7705   spec = CODING_SYSTEM_SPEC (coding_system);
7706   eol_type = AREF (spec, 2);
7707   if (VECTORP (eol_type))
7708     return Fcopy_sequence (eol_type);
7709   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
7710   return make_number (n);
7711 }
7712
7713 #endif /* emacs */
7714
7715 \f
7716 /*** 9. Post-amble ***/
7717
7718 void
7719 init_coding_once ()
7720 {
7721   int i;
7722
7723   for (i = 0; i < coding_category_max; i++)
7724     {
7725       coding_categories[i].id = -1;
7726       coding_priorities[i] = i;
7727     }
7728
7729   /* ISO2022 specific initialize routine.  */
7730   for (i = 0; i < 0x20; i++)
7731     iso_code_class[i] = ISO_control_0;
7732   for (i = 0x21; i < 0x7F; i++)
7733     iso_code_class[i] = ISO_graphic_plane_0;
7734   for (i = 0x80; i < 0xA0; i++)
7735     iso_code_class[i] = ISO_control_1;
7736   for (i = 0xA1; i < 0xFF; i++)
7737     iso_code_class[i] = ISO_graphic_plane_1;
7738   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7739   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7740   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7741   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7742   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7743   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7744   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7745   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7746   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7747   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7748
7749   inhibit_pre_post_conversion = 0;
7750
7751   for (i = 0; i < 256; i++)
7752     {
7753       emacs_mule_bytes[i] = 1;
7754     }
7755 }
7756
7757 #ifdef emacs
7758
7759 void
7760 syms_of_coding ()
7761 {
7762   staticpro (&Vcoding_system_hash_table);
7763   Vcoding_system_hash_table = Fmakehash (Qeq);
7764
7765   staticpro (&Vsjis_coding_system);
7766   Vsjis_coding_system = Qnil;
7767
7768   staticpro (&Vbig5_coding_system);
7769   Vbig5_coding_system = Qnil;
7770
7771   staticpro (&Vcode_conversion_work_buf_list);
7772   Vcode_conversion_work_buf_list = Qnil;
7773
7774   staticpro (&Vcode_conversion_reused_work_buf);
7775   Vcode_conversion_reused_work_buf = Qnil;
7776
7777   DEFSYM (Qcharset, "charset");
7778   DEFSYM (Qtarget_idx, "target-idx");
7779   DEFSYM (Qcoding_system_history, "coding-system-history");
7780   Fset (Qcoding_system_history, Qnil);
7781
7782   /* Target FILENAME is the first argument.  */
7783   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7784   /* Target FILENAME is the third argument.  */
7785   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7786
7787   DEFSYM (Qcall_process, "call-process");
7788   /* Target PROGRAM is the first argument.  */
7789   Fput (Qcall_process, Qtarget_idx, make_number (0));
7790
7791   DEFSYM (Qcall_process_region, "call-process-region");
7792   /* Target PROGRAM is the third argument.  */
7793   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7794
7795   DEFSYM (Qstart_process, "start-process");
7796   /* Target PROGRAM is the third argument.  */
7797   Fput (Qstart_process, Qtarget_idx, make_number (2));
7798
7799   DEFSYM (Qopen_network_stream, "open-network-stream");
7800   /* Target SERVICE is the fourth argument.  */
7801   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7802
7803   DEFSYM (Qcoding_system, "coding-system");
7804   DEFSYM (Qcoding_aliases, "coding-aliases");
7805
7806   DEFSYM (Qeol_type, "eol-type");
7807   DEFSYM (Qunix, "unix");
7808   DEFSYM (Qdos, "dos");
7809   DEFSYM (Qmac, "mac");
7810
7811   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
7812   DEFSYM (Qpost_read_conversion, "post-read-conversion");
7813   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
7814   DEFSYM (Qdefault_char, "default-char");
7815   DEFSYM (Qundecided, "undecided");
7816   DEFSYM (Qno_conversion, "no-conversion");
7817   DEFSYM (Qraw_text, "raw-text");
7818
7819   DEFSYM (Qiso_2022, "iso-2022");
7820
7821   DEFSYM (Qutf_8, "utf-8");
7822
7823   DEFSYM (Qutf_16, "utf-16");
7824   DEFSYM (Qutf_16_be, "utf-16-be");
7825   DEFSYM (Qutf_16_be_nosig, "utf-16-be-nosig");
7826   DEFSYM (Qutf_16_le, "utf-16-l3");
7827   DEFSYM (Qutf_16_le_nosig, "utf-16-le-nosig");
7828   DEFSYM (Qsignature, "signature");
7829   DEFSYM (Qendian, "endian");
7830   DEFSYM (Qbig, "big");
7831   DEFSYM (Qlittle, "little");
7832
7833   DEFSYM (Qshift_jis, "shift-jis");
7834   DEFSYM (Qbig5, "big5");
7835
7836   DEFSYM (Qcoding_system_p, "coding-system-p");
7837
7838   DEFSYM (Qcoding_system_error, "coding-system-error");
7839   Fput (Qcoding_system_error, Qerror_conditions,
7840         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7841   Fput (Qcoding_system_error, Qerror_message,
7842         build_string ("Invalid coding system"));
7843
7844   /* Intern this now in case it isn't already done.
7845      Setting this variable twice is harmless.
7846      But don't staticpro it here--that is done in alloc.c.  */
7847   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7848
7849   DEFSYM (Qtranslation_table, "translation-table");
7850   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
7851   DEFSYM (Qtranslation_table_id, "translation-table-id");
7852   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
7853   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
7854
7855   DEFSYM (Qchar_coding_system, "char-coding-system");
7856
7857   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (2));
7858
7859   DEFSYM (Qvalid_codes, "valid-codes");
7860
7861   DEFSYM (Qemacs_mule, "emacs-mule");
7862
7863   Vcoding_category_table
7864     = Fmake_vector (make_number (coding_category_max), Qnil);
7865   staticpro (&Vcoding_category_table);
7866   /* Followings are target of code detection.  */
7867   ASET (Vcoding_category_table, coding_category_iso_7,
7868         intern ("coding-category-iso-7"));
7869   ASET (Vcoding_category_table, coding_category_iso_7_tight,
7870         intern ("coding-category-iso-7-tight"));
7871   ASET (Vcoding_category_table, coding_category_iso_8_1,
7872         intern ("coding-category-iso-8-1"));
7873   ASET (Vcoding_category_table, coding_category_iso_8_2,
7874         intern ("coding-category-iso-8-2"));
7875   ASET (Vcoding_category_table, coding_category_iso_7_else,
7876         intern ("coding-category-iso-7-else"));
7877   ASET (Vcoding_category_table, coding_category_iso_8_else,
7878         intern ("coding-category-iso-8-else"));
7879   ASET (Vcoding_category_table, coding_category_utf_8,
7880         intern ("coding-category-utf-8"));
7881   ASET (Vcoding_category_table, coding_category_utf_16_be,
7882         intern ("coding-category-utf-16-be"));
7883   ASET (Vcoding_category_table, coding_category_utf_16_le,
7884         intern ("coding-category-utf-16-le"));
7885   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
7886         intern ("coding-category-utf-16-be-nosig"));
7887   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
7888         intern ("coding-category-utf-16-le-nosig"));
7889   ASET (Vcoding_category_table, coding_category_charset,
7890         intern ("coding-category-charset"));
7891   ASET (Vcoding_category_table, coding_category_sjis,
7892         intern ("coding-category-sjis"));
7893   ASET (Vcoding_category_table, coding_category_big5,
7894         intern ("coding-category-big5"));
7895   ASET (Vcoding_category_table, coding_category_ccl,
7896         intern ("coding-category-ccl"));
7897   ASET (Vcoding_category_table, coding_category_emacs_mule,
7898         intern ("coding-category-emacs-mule"));
7899   /* Followings are NOT target of code detection.  */
7900   ASET (Vcoding_category_table, coding_category_raw_text,
7901         intern ("coding-category-raw-text"));
7902   ASET (Vcoding_category_table, coding_category_undecided,
7903         intern ("coding-category-undecided"));
7904
7905   defsubr (&Scoding_system_p);
7906   defsubr (&Sread_coding_system);
7907   defsubr (&Sread_non_nil_coding_system);
7908   defsubr (&Scheck_coding_system);
7909   defsubr (&Sdetect_coding_region);
7910   defsubr (&Sdetect_coding_string);
7911   defsubr (&Sfind_coding_systems_region_internal);
7912   defsubr (&Scheck_coding_systems_region);
7913   defsubr (&Sdecode_coding_region);
7914   defsubr (&Sencode_coding_region);
7915   defsubr (&Sdecode_coding_string);
7916   defsubr (&Sencode_coding_string);
7917   defsubr (&Sdecode_sjis_char);
7918   defsubr (&Sencode_sjis_char);
7919   defsubr (&Sdecode_big5_char);
7920   defsubr (&Sencode_big5_char);
7921   defsubr (&Sset_terminal_coding_system_internal);
7922   defsubr (&Sset_safe_terminal_coding_system_internal);
7923   defsubr (&Sterminal_coding_system);
7924   defsubr (&Sset_keyboard_coding_system_internal);
7925   defsubr (&Skeyboard_coding_system);
7926   defsubr (&Sfind_operation_coding_system);
7927   defsubr (&Sset_coding_system_priority);
7928   defsubr (&Sdefine_coding_system_internal);
7929   defsubr (&Sdefine_coding_system_alias);
7930   defsubr (&Scoding_system_base);
7931   defsubr (&Scoding_system_plist);
7932   defsubr (&Scoding_system_aliases);
7933   defsubr (&Scoding_system_eol_type);
7934   defsubr (&Scoding_system_priority_list);
7935
7936   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7937                doc: /* List of coding systems.
7938
7939 Do not alter the value of this variable manually.  This variable should be
7940 updated by the functions `define-coding-system' and
7941 `define-coding-system-alias'.  */);
7942   Vcoding_system_list = Qnil;
7943
7944   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7945                doc: /* Alist of coding system names.
7946 Each element is one element list of coding system name.
7947 This variable is given to `completing-read' as TABLE argument.
7948
7949 Do not alter the value of this variable manually.  This variable should be
7950 updated by the functions `make-coding-system' and
7951 `define-coding-system-alias'.  */);
7952   Vcoding_system_alist = Qnil;
7953
7954   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7955                doc: /* List of coding-categories (symbols) ordered by priority.
7956
7957 On detecting a coding system, Emacs tries code detection algorithms
7958 associated with each coding-category one by one in this order.  When
7959 one algorithm agrees with a byte sequence of source text, the coding
7960 system bound to the corresponding coding-category is selected.  */);
7961   {
7962     int i;
7963
7964     Vcoding_category_list = Qnil;
7965     for (i = coding_category_max - 1; i >= 0; i--)
7966       Vcoding_category_list
7967         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7968                  Vcoding_category_list);
7969   }
7970
7971   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7972                doc: /* Specify the coding system for read operations.
7973 It is useful to bind this variable with `let', but do not set it globally.
7974 If the value is a coding system, it is used for decoding on read operation.
7975 If not, an appropriate element is used from one of the coding system alists:
7976 There are three such tables, `file-coding-system-alist',
7977 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7978   Vcoding_system_for_read = Qnil;
7979
7980   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7981                doc: /* Specify the coding system for write operations.
7982 Programs bind this variable with `let', but you should not set it globally.
7983 If the value is a coding system, it is used for encoding of output,
7984 when writing it to a file and when sending it to a file or subprocess.
7985
7986 If this does not specify a coding system, an appropriate element
7987 is used from one of the coding system alists:
7988 There are three such tables, `file-coding-system-alist',
7989 `process-coding-system-alist', and `network-coding-system-alist'.
7990 For output to files, if the above procedure does not specify a coding system,
7991 the value of `buffer-file-coding-system' is used.  */);
7992   Vcoding_system_for_write = Qnil;
7993
7994   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7995                doc: /*
7996 Coding system used in the latest file or process I/O.  */);
7997   Vlast_coding_system_used = Qnil;
7998
7999   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8000                doc: /*
8001 *Non-nil means always inhibit code conversion of end-of-line format.
8002 See info node `Coding Systems' and info node `Text and Binary' concerning
8003 such conversion.  */);
8004   inhibit_eol_conversion = 0;
8005
8006   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8007                doc: /*
8008 Non-nil means process buffer inherits coding system of process output.
8009 Bind it to t if the process output is to be treated as if it were a file
8010 read from some filesystem.  */);
8011   inherit_process_coding_system = 0;
8012
8013   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8014                doc: /*
8015 Alist to decide a coding system to use for a file I/O operation.
8016 The format is ((PATTERN . VAL) ...),
8017 where PATTERN is a regular expression matching a file name,
8018 VAL is a coding system, a cons of coding systems, or a function symbol.
8019 If VAL is a coding system, it is used for both decoding and encoding
8020 the file contents.
8021 If VAL is a cons of coding systems, the car part is used for decoding,
8022 and the cdr part is used for encoding.
8023 If VAL is a function symbol, the function must return a coding system
8024 or a cons of coding systems which are used as above.  The function gets
8025 the arguments with which `find-operation-coding-systems' was called.
8026
8027 See also the function `find-operation-coding-system'
8028 and the variable `auto-coding-alist'.  */);
8029   Vfile_coding_system_alist = Qnil;
8030
8031   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8032                doc: /*
8033 Alist to decide a coding system to use for a process I/O operation.
8034 The format is ((PATTERN . VAL) ...),
8035 where PATTERN is a regular expression matching a program name,
8036 VAL is a coding system, a cons of coding systems, or a function symbol.
8037 If VAL is a coding system, it is used for both decoding what received
8038 from the program and encoding what sent to the program.
8039 If VAL is a cons of coding systems, the car part is used for decoding,
8040 and the cdr part is used for encoding.
8041 If VAL is a function symbol, the function must return a coding system
8042 or a cons of coding systems which are used as above.
8043
8044 See also the function `find-operation-coding-system'.  */);
8045   Vprocess_coding_system_alist = Qnil;
8046
8047   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8048                doc: /*
8049 Alist to decide a coding system to use for a network I/O operation.
8050 The format is ((PATTERN . VAL) ...),
8051 where PATTERN is a regular expression matching a network service name
8052 or is a port number to connect to,
8053 VAL is a coding system, a cons of coding systems, or a function symbol.
8054 If VAL is a coding system, it is used for both decoding what received
8055 from the network stream and encoding what sent to the network stream.
8056 If VAL is a cons of coding systems, the car part is used for decoding,
8057 and the cdr part is used for encoding.
8058 If VAL is a function symbol, the function must return a coding system
8059 or a cons of coding systems which are used as above.
8060
8061 See also the function `find-operation-coding-system'.  */);
8062   Vnetwork_coding_system_alist = Qnil;
8063
8064   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8065                doc: /* Coding system to use with system messages.
8066 Also used for decoding keyboard input on X Window system.  */);
8067   Vlocale_coding_system = Qnil;
8068
8069   /* The eol mnemonics are reset in startup.el system-dependently.  */
8070   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8071                doc: /*
8072 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8073   eol_mnemonic_unix = build_string (":");
8074
8075   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8076                doc: /*
8077 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8078   eol_mnemonic_dos = build_string ("\\");
8079
8080   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8081                doc: /*
8082 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8083   eol_mnemonic_mac = build_string ("/");
8084
8085   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8086                doc: /*
8087 *String displayed in mode line when end-of-line format is not yet determined.  */);
8088   eol_mnemonic_undecided = build_string (":");
8089
8090   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8091                doc: /*
8092 *Non-nil enables character translation while encoding and decoding.  */);
8093   Venable_character_translation = Qt;
8094
8095   DEFVAR_LISP ("standard-translation-table-for-decode",
8096                &Vstandard_translation_table_for_decode,
8097                doc: /* Table for translating characters while decoding.  */);
8098   Vstandard_translation_table_for_decode = Qnil;
8099
8100   DEFVAR_LISP ("standard-translation-table-for-encode",
8101                &Vstandard_translation_table_for_encode,
8102                doc: /* Table for translating characters while encoding.  */);
8103   Vstandard_translation_table_for_encode = Qnil;
8104
8105   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8106                doc: /* Alist of charsets vs revision numbers.
8107 While encoding, if a charset (car part of an element) is found,
8108 designate it with the escape sequence identifying revision (cdr part
8109 of the element).  */);
8110   Vcharset_revision_table = Qnil;
8111
8112   DEFVAR_LISP ("default-process-coding-system",
8113                &Vdefault_process_coding_system,
8114                doc: /* Cons of coding systems used for process I/O by default.
8115 The car part is used for decoding a process output,
8116 the cdr part is used for encoding a text to be sent to a process.  */);
8117   Vdefault_process_coding_system = Qnil;
8118
8119   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8120                doc: /*
8121 Table of extra Latin codes in the range 128..159 (inclusive).
8122 This is a vector of length 256.
8123 If Nth element is non-nil, the existence of code N in a file
8124 \(or output of subprocess) doesn't prevent it to be detected as
8125 a coding system of ISO 2022 variant which has a flag
8126 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8127 or reading output of a subprocess.
8128 Only 128th through 159th elements has a meaning.  */);
8129   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8130
8131   DEFVAR_LISP ("select-safe-coding-system-function",
8132                &Vselect_safe_coding_system_function,
8133                doc: /*
8134 Function to call to select safe coding system for encoding a text.
8135
8136 If set, this function is called to force a user to select a proper
8137 coding system which can encode the text in the case that a default
8138 coding system used in each operation can't encode the text.
8139
8140 The default value is `select-safe-coding-system' (which see).  */);
8141   Vselect_safe_coding_system_function = Qnil;
8142
8143   DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
8144                doc: /*
8145 Char-table containing safe coding systems of each characters.
8146 Each element doesn't include such generic coding systems that can
8147 encode any characters.   They are in the first extra slot.  */);
8148   Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
8149
8150   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8151                &inhibit_iso_escape_detection,
8152                doc: /*
8153 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8154
8155 By default, on reading a file, Emacs tries to detect how the text is
8156 encoded.  This code detection is sensitive to escape sequences.  If
8157 the sequence is valid as ISO2022, the code is determined as one of
8158 the ISO2022 encodings, and the file is decoded by the corresponding
8159 coding system (e.g. `iso-2022-7bit').
8160
8161 However, there may be a case that you want to read escape sequences in
8162 a file as is.  In such a case, you can set this variable to non-nil.
8163 Then, as the code detection ignores any escape sequences, no file is
8164 detected as encoded in some ISO2022 encoding.  The result is that all
8165 escape sequences become visible in a buffer.
8166
8167 The default value is nil, and it is strongly recommended not to change
8168 it.  That is because many Emacs Lisp source files that contain
8169 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8170 in Emacs's distribution, and they won't be decoded correctly on
8171 reading if you suppress escape sequence detection.
8172
8173 The other way to read escape sequences in a file without decoding is
8174 to explicitly specify some coding system that doesn't use ISO2022's
8175 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8176   inhibit_iso_escape_detection = 0;
8177
8178   {
8179     Lisp_Object args[coding_arg_max];
8180     Lisp_Object plist[14];
8181     int i;
8182
8183     for (i = 0; i < coding_arg_max; i++)
8184       args[i] = Qnil;
8185
8186     plist[0] = intern (":name");
8187     plist[1] = args[coding_arg_name] = Qno_conversion;
8188     plist[2] = intern (":mnemonic");
8189     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8190     plist[4] = intern (":coding-type");
8191     plist[5] = args[coding_arg_coding_type] = Qraw_text;
8192     plist[6] = intern (":ascii-compatible-p");
8193     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8194     plist[8] = intern (":default-char");
8195     plist[9] = args[coding_arg_default_char] = make_number (0);
8196     plist[10] = intern (":docstring");
8197     plist[11] = build_string ("Do no conversion.\n\
8198 \n\
8199 When you visit a file with this coding, the file is read into a\n\
8200 unibyte buffer as is, thus each byte of a file is treated as a\n\
8201 character.");
8202     plist[12] = intern (":eol-type");
8203     plist[13] = args[coding_arg_eol_type] = Qunix;
8204     args[coding_arg_plist] = Flist (14, plist);
8205     Fdefine_coding_system_internal (coding_arg_max, args);
8206   }
8207
8208   setup_coding_system (Qno_conversion, &keyboard_coding);
8209   setup_coding_system (Qno_conversion, &terminal_coding);
8210   setup_coding_system (Qno_conversion, &safe_terminal_coding);
8211 }
8212
8213 char *
8214 emacs_strerror (error_number)
8215      int error_number;
8216 {
8217   char *str;
8218
8219   synchronize_system_messages_locale ();
8220   str = strerror (error_number);
8221
8222   if (! NILP (Vlocale_coding_system))
8223     {
8224       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8225                                                       Vlocale_coding_system,
8226                                                       0);
8227       str = (char *) XSTRING (dec)->data;
8228     }
8229
8230   return str;
8231 }
8232
8233 #endif /* emacs */