src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001 Free Software Foundation, Inc.
   5    Copyright (C) 2001, 2002
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   Coding system is an object for a encoding mechanism that contains
  50   information about how to convert byte sequence to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from a
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the mechanism of encoding.  Here's a brief descrition about type.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by code converter defined for each
  75   character set.
  76
  77   o Old Emacs' internal format (emacs-mule)
  78
  79   The coding system adopted by an old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode a text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for a text containing raw eight-bit data.  Emacs
 112   treat each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How end-of-line of a text is encoded depends on a system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text characters encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g.  the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX.  Return 1 if the data contains
 148   a byte sequence which can be decoded into non-ASCII characters by
 149   the coding system.  Otherwize (i.e. the data contains only ASCII
 150   characters or invalid sequence) return 0.
 151
 152   It also resets some bits of an integer pointed by MASK.  The macros
 153   CATEGORY_MASK_XXX specifies each bit of this integer.
 154
 155   Below is the template of these functions.  */
 156
 157 #if 0
 158 static int
 159 detect_coding_XXX (coding, mask)
 160      struct coding_system *coding;
 161      int *mask;
 162 {
 163   unsigned char *src = coding->source;
 164   unsigned char *src_end = coding->source + coding->src_bytes;
 165   int multibytep = coding->src_multibyte;
 166   int c;
 167   int found = 0;
 168   ...;
 169
 170   while (1)
 171     {
 172       /* Get one byte from the source.  If the souce is exausted, jump
 173          to no_more_source:.  */
 174       ONE_MORE_BYTE (c);
 175       /* Check if it conforms to XXX.  If not, break the loop.  */
 176     }
 177   /* As the data is invalid for XXX, reset a proper bits.  */
 178   *mask &= ~CODING_CATEGORY_XXX;
 179   return 0;
 180  no_more_source:
 181   /* The source exausted.  */
 182   if (!found)
 183     /* ASCII characters only. */
 184     return 0;
 185   /* Some data should be decoded into non-ASCII characters.  */
 186   *mask &= CODING_CATEGORY_XXX;
 187   return 1;
 188 }
 189 #endif
 190
 191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 192
 193   These functions decode a byte sequence specified as a source by
 194   CODING.  The resulting multibyte text goes to a place pointed to by
 195   CODING->charbuf, the length of which should not exceed
 196   CODING->charbuf_size;
 197
 198   These functions set the information of original and decoded texts in
 199   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 200   They also set CODING->result to one of CODING_RESULT_XXX indicating
 201   how the decoding is finished.
 202
 203   Below is the template of these functions.  */
 204
 205 #if 0
 206 static void
 207 decode_coding_XXXX (coding)
 208      struct coding_system *coding;
 209 {
 210   unsigned char *src = coding->source + coding->consumed;
 211   unsigned char *src_end = coding->source + coding->src_bytes;
 212   /* SRC_BASE remembers the start position in source in each loop.
 213      The loop will be exited when there's not enough source code, or
 214      when there's no room in CHARBUF for a decoded character.  */
 215   unsigned char *src_base;
 216   /* A buffer to produce decoded characters.  */
 217   int *charbuf = coding->charbuf;
 218   int *charbuf_end = charbuf + coding->charbuf_size;
 219   int multibytep = coding->src_multibyte;
 220
 221   while (1)
 222     {
 223       src_base = src;
 224       if (charbuf < charbuf_end)
 225         /* No more room to produce a decoded character.  */
 226         break;
 227       ONE_MORE_BYTE (c);
 228       /* Decode it. */
 229     }
 230
 231  no_more_source:
 232   if (src_base < src_end
 233       && coding->mode & CODING_MODE_LAST_BLOCK)
 234     /* If the source ends by partial bytes to construct a character,
 235        treat them as eight-bit raw data.  */
 236     while (src_base < src_end && charbuf < charbuf_end)
 237       *charbuf++ = *src_base++;
 238   /* Remember how many bytes and characters we consumed.  If the
 239      source is multibyte, the bytes and chars are not identical.  */
 240   coding->consumed = coding->consumed_char = src_base - coding->source;
 241   /* Remember how many characters we produced.  */
 242   coding->charbuf_used = charbuf - coding->charbuf;
 243 }
 244 #endif
 245
 246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 247
 248   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 249   internal multibyte format by CODING.  The resulting byte sequence
 250   goes to a place pointed to by DESTINATION, the length of which
 251   should not exceed DST_BYTES.
 252
 253   These functions set the information of original and encoded texts in
 254   the members produced, produced_char, consumed, and consumed_char of
 255   the structure *CODING.  They also set the member result to one of
 256   CODING_RESULT_XXX indicating how the encoding finished.
 257
 258   DST_BYTES zero means that source area and destination area are
 259   overlapped, which means that we can produce a encoded text until it
 260   reaches at the head of not-yet-encoded source text.
 261
 262   Below is a template of these functions.  */
 263 #if 0
 264 static void
 265 encode_coding_XXX (coding)
 266      struct coding_system *coding;
 267 {
 268   int multibytep = coding->dst_multibyte;
 269   int *charbuf = coding->charbuf;
 270   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 271   unsigned char *dst = coding->destination + coding->produced;
 272   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 273   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 274   int produced_chars = 0;
 275
 276   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 277     {
 278       int c = *charbuf;
 279       /* Encode C into DST, and increment DST.  */
 280     }
 281  label_no_more_destination:
 282   /* How many chars and bytes we produced.  */
 283   coding->produced_char += produced_chars;
 284   coding->produced = dst - coding->destination;
 285 }
 286 #endif
 287
 288 \f
 289 /*** 1. Preamble ***/
 290
 291 #include <config.h>
 292 #include <stdio.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302
 303 Lisp_Object Vcoding_system_hash_table;
 304
 305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 306 Lisp_Object Qunix, Qdos, Qmac;
 307 Lisp_Object Qbuffer_file_coding_system;
 308 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 309 Lisp_Object Qdefault_char;
 310 Lisp_Object Qno_conversion, Qundecided;
 311 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 312 Lisp_Object Qutf_16_be_nosig, Qutf_16_be, Qutf_16_le_nosig, Qutf_16_le;
 313 Lisp_Object Qsignature, Qendian, Qbig, Qlittle;
 314 Lisp_Object Qcoding_system_history;
 315 Lisp_Object Qvalid_codes;
 316
 317 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 318 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 319 Lisp_Object Qstart_process, Qopen_network_stream;
 320 Lisp_Object Qtarget_idx;
 321
 322 Lisp_Object Vselect_safe_coding_system_function;
 323
 324 /* Mnemonic string for each format of end-of-line.  */
 325 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 326 /* Mnemonic string to indicate format of end-of-line is not yet
 327    decided.  */
 328 Lisp_Object eol_mnemonic_undecided;
 329
 330 #ifdef emacs
 331
 332 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 333
 334 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 335
 336 /* Coding system emacs-mule and raw-text are for converting only
 337    end-of-line format.  */
 338 Lisp_Object Qemacs_mule, Qraw_text;
 339
 340 /* Coding-systems are handed between Emacs Lisp programs and C internal
 341    routines by the following three variables.  */
 342 /* Coding-system for reading files and receiving data from process.  */
 343 Lisp_Object Vcoding_system_for_read;
 344 /* Coding-system for writing files and sending data to process.  */
 345 Lisp_Object Vcoding_system_for_write;
 346 /* Coding-system actually used in the latest I/O.  */
 347 Lisp_Object Vlast_coding_system_used;
 348
 349 /* A vector of length 256 which contains information about special
 350    Latin codes (especially for dealing with Microsoft codes).  */
 351 Lisp_Object Vlatin_extra_code_table;
 352
 353 /* Flag to inhibit code conversion of end-of-line format.  */
 354 int inhibit_eol_conversion;
 355
 356 /* Flag to inhibit ISO2022 escape sequence detection.  */
 357 int inhibit_iso_escape_detection;
 358
 359 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 360 int inherit_process_coding_system;
 361
 362 /* Coding system to be used to encode text for terminal display.  */
 363 struct coding_system terminal_coding;
 364
 365 /* Coding system to be used to encode text for terminal display when
 366    terminal coding system is nil.  */
 367 struct coding_system safe_terminal_coding;
 368
 369 /* Coding system of what is sent from terminal keyboard.  */
 370 struct coding_system keyboard_coding;
 371
 372 Lisp_Object Vfile_coding_system_alist;
 373 Lisp_Object Vprocess_coding_system_alist;
 374 Lisp_Object Vnetwork_coding_system_alist;
 375
 376 Lisp_Object Vlocale_coding_system;
 377
 378 #endif /* emacs */
 379
 380 /* Flag to tell if we look up translation table on character code
 381    conversion.  */
 382 Lisp_Object Venable_character_translation;
 383 /* Standard translation table to look up on decoding (reading).  */
 384 Lisp_Object Vstandard_translation_table_for_decode;
 385 /* Standard translation table to look up on encoding (writing).  */
 386 Lisp_Object Vstandard_translation_table_for_encode;
 387
 388 Lisp_Object Qtranslation_table;
 389 Lisp_Object Qtranslation_table_id;
 390 Lisp_Object Qtranslation_table_for_decode;
 391 Lisp_Object Qtranslation_table_for_encode;
 392
 393 /* Alist of charsets vs revision number.  */
 394 static Lisp_Object Vcharset_revision_table;
 395
 396 /* Default coding systems used for process I/O.  */
 397 Lisp_Object Vdefault_process_coding_system;
 398
 399 /* Global flag to tell that we can't call post-read-conversion and
 400    pre-write-conversion functions.  Usually the value is zero, but it
 401    is set to 1 temporarily while such functions are running.  This is
 402    to avoid infinite recursive call.  */
 403 static int inhibit_pre_post_conversion;
 404
 405 /* Char-table containing safe coding systems of each character.  */
 406 Lisp_Object Vchar_coding_system_table;
 407 Lisp_Object Qchar_coding_system;
 408
 409 /* Two special coding systems.  */
 410 Lisp_Object Vsjis_coding_system;
 411 Lisp_Object Vbig5_coding_system;
 412
 413
 414 static int detect_coding_utf_8 P_ ((struct coding_system *, int *));
 415 static void decode_coding_utf_8 P_ ((struct coding_system *));
 416 static int encode_coding_utf_8 P_ ((struct coding_system *));
 417
 418 static int detect_coding_utf_16 P_ ((struct coding_system *, int *));
 419 static void decode_coding_utf_16 P_ ((struct coding_system *));
 420 static int encode_coding_utf_16 P_ ((struct coding_system *));
 421
 422 static int detect_coding_iso_2022 P_ ((struct coding_system *, int *));
 423 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 424 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 425
 426 static int detect_coding_emacs_mule P_ ((struct coding_system *, int *));
 427 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 428 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 429
 430 static int detect_coding_sjis P_ ((struct coding_system *, int *));
 431 static void decode_coding_sjis P_ ((struct coding_system *));
 432 static int encode_coding_sjis P_ ((struct coding_system *));
 433
 434 static int detect_coding_big5 P_ ((struct coding_system *, int *));
 435 static void decode_coding_big5 P_ ((struct coding_system *));
 436 static int encode_coding_big5 P_ ((struct coding_system *));
 437
 438 static int detect_coding_ccl P_ ((struct coding_system *, int *));
 439 static void decode_coding_ccl P_ ((struct coding_system *));
 440 static int encode_coding_ccl P_ ((struct coding_system *));
 441
 442 static void decode_coding_raw_text P_ ((struct coding_system *));
 443 static int encode_coding_raw_text P_ ((struct coding_system *));
 444
 445
 446 /* ISO2022 section */
 447
 448 #define CODING_ISO_INITIAL(coding, reg)                 \
 449   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 450                      coding_attr_iso_initial),          \
 451                reg)))
 452
 453
 454 #define CODING_ISO_REQUEST(coding, charset_id)  \
 455   ((charset_id <= (coding)->max_charset_id      \
 456     ? (coding)->safe_charsets[charset_id]       \
 457     : -1))
 458
 459
 460 #define CODING_ISO_FLAGS(coding)        \
 461   ((coding)->spec.iso_2022.flags)
 462 #define CODING_ISO_DESIGNATION(coding, reg)     \
 463   ((coding)->spec.iso_2022.current_designation[reg])
 464 #define CODING_ISO_INVOCATION(coding, plane)    \
 465   ((coding)->spec.iso_2022.current_invocation[plane])
 466 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 467   ((coding)->spec.iso_2022.single_shifting)
 468 #define CODING_ISO_BOL(coding)  \
 469   ((coding)->spec.iso_2022.bol)
 470 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 471   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 472
 473 /* Control characters of ISO2022.  */
 474                         /* code */      /* function */
 475 #define ISO_CODE_LF     0x0A            /* line-feed */
 476 #define ISO_CODE_CR     0x0D            /* carriage-return */
 477 #define ISO_CODE_SO     0x0E            /* shift-out */
 478 #define ISO_CODE_SI     0x0F            /* shift-in */
 479 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 480 #define ISO_CODE_ESC    0x1B            /* escape */
 481 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 482 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 483 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 484
 485 /* All code (1-byte) of ISO2022 is classified into one of the
 486    followings.  */
 487 enum iso_code_class_type
 488   {
 489     ISO_control_0,              /* Control codes in the range
 490                                    0x00..0x1F and 0x7F, except for the
 491                                    following 5 codes.  */
 492     ISO_carriage_return,        /* ISO_CODE_CR (0x0D) */
 493     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 494     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 495     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 496     ISO_escape,                 /* ISO_CODE_SO (0x1B) */
 497     ISO_control_1,              /* Control codes in the range
 498                                    0x80..0x9F, except for the
 499                                    following 3 codes.  */
 500     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 501     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 502     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 503     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 504     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 505     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 506     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 507   };
 508
 509 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 510     `iso-flags' attribute of an iso2022 coding system.  */
 511
 512 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 513    instead of the correct short-form sequence (e.g. ESC $ A).  */
 514 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 515
 516 /* If set, reset graphic planes and registers at end-of-line to the
 517    initial state.  */
 518 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 519
 520 /* If set, reset graphic planes and registers before any control
 521    characters to the initial state.  */
 522 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 523
 524 /* If set, encode by 7-bit environment.  */
 525 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 526
 527 /* If set, use locking-shift function.  */
 528 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 529
 530 /* If set, use single-shift function.  Overwrite
 531    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 532 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 533
 534 /* If set, use designation escape sequence.  */
 535 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 536
 537 /* If set, produce revision number sequence.  */
 538 #define CODING_ISO_FLAG_REVISION        0x0080
 539
 540 /* If set, produce ISO6429's direction specifying sequence.  */
 541 #define CODING_ISO_FLAG_DIRECTION       0x0100
 542
 543 /* If set, assume designation states are reset at beginning of line on
 544    output.  */
 545 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 546
 547 /* If set, designation sequence should be placed at beginning of line
 548    on output.  */
 549 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 550
 551 /* If set, do not encode unsafe charactes on output.  */
 552 #define CODING_ISO_FLAG_SAFE            0x0800
 553
 554 /* If set, extra latin codes (128..159) are accepted as a valid code
 555    on input.  */
 556 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 557
 558 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 559
 560 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 561
 562 #define CODING_ISO_FLAG_FULL_SUPPORT    0x8000
 563
 564 /* A character to be produced on output if encoding of the original
 565    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 566 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 567
 568
 569 /* UTF-16 section */
 570 #define CODING_UTF_16_BOM(coding)       \
 571   ((coding)->spec.utf_16.bom)
 572
 573 #define CODING_UTF_16_ENDIAN(coding)    \
 574   ((coding)->spec.utf_16.endian)
 575
 576 #define CODING_UTF_16_SURROGATE(coding) \
 577   ((coding)->spec.utf_16.surrogate)
 578
 579
 580 /* CCL section */
 581 #define CODING_CCL_DECODER(coding)      \
 582   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 583 #define CODING_CCL_ENCODER(coding)      \
 584   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 585 #define CODING_CCL_VALIDS(coding)                                          \
 586   (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
 587    ->data)
 588
 589 /* Index for each coding category in `coding_category_table' */
 590
 591 enum coding_category
 592   {
 593     coding_category_iso_7,
 594     coding_category_iso_7_tight,
 595     coding_category_iso_8_1,
 596     coding_category_iso_8_2,
 597     coding_category_iso_7_else,
 598     coding_category_iso_8_else,
 599     coding_category_utf_8,
 600     coding_category_utf_16_auto,
 601     coding_category_utf_16_be,
 602     coding_category_utf_16_le,
 603     coding_category_utf_16_be_nosig,
 604     coding_category_utf_16_le_nosig,
 605     coding_category_charset,
 606     coding_category_sjis,
 607     coding_category_big5,
 608     coding_category_ccl,
 609     coding_category_emacs_mule,
 610     /* All above are targets of code detection.  */
 611     coding_category_raw_text,
 612     coding_category_undecided,
 613     coding_category_max
 614   };
 615
 616 /* Definitions of flag bits used in detect_coding_XXXX.  */
 617 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 618 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 619 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 620 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 621 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 622 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 623 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 624 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 625 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 626 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 627 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 628 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 629 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 630 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 631 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 632 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 633
 634 /* This value is returned if detect_coding_mask () find nothing other
 635    than ASCII characters.  */
 636 #define CATEGORY_MASK_ANY               \
 637   (CATEGORY_MASK_ISO_7                  \
 638    | CATEGORY_MASK_ISO_7_TIGHT          \
 639    | CATEGORY_MASK_ISO_8_1              \
 640    | CATEGORY_MASK_ISO_8_2              \
 641    | CATEGORY_MASK_ISO_7_ELSE           \
 642    | CATEGORY_MASK_ISO_8_ELSE           \
 643    | CATEGORY_MASK_UTF_8                \
 644    | CATEGORY_MASK_UTF_16_BE            \
 645    | CATEGORY_MASK_UTF_16_LE            \
 646    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 647    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 648    | CATEGORY_MASK_CHARSET              \
 649    | CATEGORY_MASK_SJIS                 \
 650    | CATEGORY_MASK_BIG5                 \
 651    | CATEGORY_MASK_CCL                  \
 652    | CATEGORY_MASK_EMACS_MULE)
 653
 654
 655 #define CATEGORY_MASK_ISO_7BIT \
 656   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 657
 658 #define CATEGORY_MASK_ISO_8BIT \
 659   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 660
 661 #define CATEGORY_MASK_ISO_ELSE \
 662   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 663
 664 #define CATEGORY_MASK_ISO_ESCAPE        \
 665   (CATEGORY_MASK_ISO_7                  \
 666    | CATEGORY_MASK_ISO_7_TIGHT          \
 667    | CATEGORY_MASK_ISO_7_ELSE           \
 668    | CATEGORY_MASK_ISO_8_ELSE)
 669
 670 #define CATEGORY_MASK_ISO       \
 671   (  CATEGORY_MASK_ISO_7BIT     \
 672      | CATEGORY_MASK_ISO_8BIT   \
 673      | CATEGORY_MASK_ISO_ELSE)
 674
 675 #define CATEGORY_MASK_UTF_16            \
 676   (CATEGORY_MASK_UTF_16_BE              \
 677    | CATEGORY_MASK_UTF_16_LE            \
 678    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 679    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 680
 681
 682 /* List of symbols `coding-category-xxx' ordered by priority.  This
 683    variable is exposed to Emacs Lisp.  */
 684 static Lisp_Object Vcoding_category_list;
 685
 686 /* Table of coding categories (Lisp symbols).  This variable is for
 687    internal use oly.  */
 688 static Lisp_Object Vcoding_category_table;
 689
 690 /* Table of coding-categories ordered by priority.  */
 691 static enum coding_category coding_priorities[coding_category_max];
 692
 693 /* Nth element is a coding context for the coding system bound to the
 694    Nth coding category.  */
 695 static struct coding_system coding_categories[coding_category_max];
 696
 697 static int detected_mask[coding_category_raw_text] =
 698   { CATEGORY_MASK_ISO,
 699     CATEGORY_MASK_ISO,
 700     CATEGORY_MASK_ISO,
 701     CATEGORY_MASK_ISO,
 702     CATEGORY_MASK_ISO,
 703     CATEGORY_MASK_ISO,
 704     CATEGORY_MASK_UTF_8,
 705     CATEGORY_MASK_UTF_16,
 706     CATEGORY_MASK_UTF_16,
 707     CATEGORY_MASK_UTF_16,
 708     CATEGORY_MASK_UTF_16,
 709     CATEGORY_MASK_UTF_16,
 710     CATEGORY_MASK_CHARSET,
 711     CATEGORY_MASK_SJIS,
 712     CATEGORY_MASK_BIG5,
 713     CATEGORY_MASK_CCL,
 714     CATEGORY_MASK_EMACS_MULE
 715   };
 716
 717 /*** Commonly used macros and functions ***/
 718
 719 #ifndef min
 720 #define min(a, b) ((a) < (b) ? (a) : (b))
 721 #endif
 722 #ifndef max
 723 #define max(a, b) ((a) > (b) ? (a) : (b))
 724 #endif
 725
 726 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list)  \
 727   do {                                                          \
 728     attrs = CODING_ID_ATTRS (coding->id);                       \
 729     eol_type = CODING_ID_EOL_TYPE (coding->id);                 \
 730     if (VECTORP (eol_type))                                     \
 731       eol_type = Qunix;                                         \
 732     charset_list = CODING_ATTR_CHARSET_LIST (attrs);            \
 733   } while (0)
 734
 735
 736 /* Safely get one byte from the source text pointed by SRC which ends
 737    at SRC_END, and set C to that byte.  If there are not enough bytes
 738    in the source, it jumps to `no_more_source'.  The caller
 739    should declare and set these variables appropriately in advance:
 740         src, src_end, multibytep
 741 */
 742
 743 #define ONE_MORE_BYTE(c)                                        \
 744   do {                                                          \
 745     if (src == src_end)                                         \
 746       {                                                         \
 747         if (src_base < src)                                     \
 748           coding->result = CODING_RESULT_INSUFFICIENT_SRC;      \
 749         goto no_more_source;                                    \
 750       }                                                         \
 751     c = *src++;                                                 \
 752     if (multibytep && (c & 0x80))                               \
 753       {                                                         \
 754         if ((c & 0xFE) != 0xC0)                                 \
 755           error ("Undecodable char found");                     \
 756         c = ((c & 1) << 6) | *src++;                            \
 757       }                                                         \
 758     consumed_chars++;                                           \
 759   } while (0)
 760
 761
 762 #define ONE_MORE_BYTE_NO_CHECK(c)               \
 763   do {                                          \
 764     c = *src++;                                 \
 765     if (multibytep && (c & 0x80))               \
 766       {                                         \
 767         if ((c & 0xFE) != 0xC0)                 \
 768           error ("Undecodable char found");     \
 769         c = ((c & 1) << 6) | *src++;            \
 770       }                                         \
 771   } while (0)
 772
 773
 774 /* Store a byte C in the place pointed by DST and increment DST to the
 775    next free point, and increment PRODUCED_CHARS.  The caller should
 776    assure that C is 0..127, and declare and set the variable `dst'
 777    appropriately in advance.
 778 */
 779
 780
 781 #define EMIT_ONE_ASCII_BYTE(c)  \
 782   do {                          \
 783     produced_chars++;           \
 784     *dst++ = (c);               \
 785   } while (0)
 786
 787
 788 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 789
 790 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 791   do {                                  \
 792     produced_chars += 2;                \
 793     *dst++ = (c1), *dst++ = (c2);       \
 794   } while (0)
 795
 796
 797 /* Store a byte C in the place pointed by DST and increment DST to the
 798    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 799    nonzero, store in an appropriate multibyte from.  The caller should
 800    declare and set the variables `dst' and `multibytep' appropriately
 801    in advance.  */
 802
 803 #define EMIT_ONE_BYTE(c)                \
 804   do {                                  \
 805     produced_chars++;                   \
 806     if (multibytep)                     \
 807       {                                 \
 808         int ch = (c);                   \
 809         if (ch >= 0x80)                 \
 810           ch = BYTE8_TO_CHAR (ch);      \
 811         CHAR_STRING_ADVANCE (ch, dst);  \
 812       }                                 \
 813     else                                \
 814       *dst++ = (c);                     \
 815   } while (0)
 816
 817
 818 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 819
 820 #define EMIT_TWO_BYTES(c1, c2)          \
 821   do {                                  \
 822     produced_chars += 2;                \
 823     if (multibytep)                     \
 824       {                                 \
 825         int ch;                         \
 826                                         \
 827         ch = (c1);                      \
 828         if (ch >= 0x80)                 \
 829           ch = BYTE8_TO_CHAR (ch);      \
 830         CHAR_STRING_ADVANCE (ch, dst);  \
 831         ch = (c2);                      \
 832         if (ch >= 0x80)                 \
 833           ch = BYTE8_TO_CHAR (ch);      \
 834         CHAR_STRING_ADVANCE (ch, dst);  \
 835       }                                 \
 836     else                                \
 837       {                                 \
 838         *dst++ = (c1);                  \
 839         *dst++ = (c2);                  \
 840       }                                 \
 841   } while (0)
 842
 843
 844 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 845   do {                                  \
 846     EMIT_ONE_BYTE (c1);                 \
 847     EMIT_TWO_BYTES (c2, c3);            \
 848   } while (0)
 849
 850
 851 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 852   do {                                          \
 853     EMIT_TWO_BYTES (c1, c2);                    \
 854     EMIT_TWO_BYTES (c3, c4);                    \
 855   } while (0)
 856
 857
 858 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 859   do {                                                                       \
 860     charset_map_loaded = 0;                                                  \
 861     c = DECODE_CHAR (charset, code);                                         \
 862     if (charset_map_loaded)                                                  \
 863       {                                                                      \
 864         unsigned char *orig = coding->source;                                \
 865         EMACS_INT offset;                                                    \
 866                                                                              \
 867         coding_set_source (coding);                                          \
 868         offset = coding->source - orig;                                      \
 869         src += offset;                                                       \
 870         src_base += offset;                                                  \
 871         src_end += offset;                                                   \
 872       }                                                                      \
 873   } while (0)
 874
 875
 876 #define ASSURE_DESTINATION(bytes)                               \
 877   do {                                                          \
 878     if (dst + (bytes) >= dst_end)                               \
 879       {                                                         \
 880         int more_bytes = charbuf_end - charbuf + (bytes);       \
 881                                                                 \
 882         dst = alloc_destination (coding, more_bytes, dst);      \
 883         dst_end = coding->destination + coding->dst_bytes;      \
 884       }                                                         \
 885   } while (0)
 886
 887
 888
 889 static void
 890 coding_set_source (coding)
 891      struct coding_system *coding;
 892 {
 893   if (BUFFERP (coding->src_object))
 894     {
 895       if (coding->src_pos < 0)
 896         coding->source = GAP_END_ADDR + coding->src_pos_byte;
 897       else
 898         {
 899           struct buffer *buf = XBUFFER (coding->src_object);
 900           EMACS_INT beg_byte = BUF_BEG_BYTE (buf);
 901           EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
 902           unsigned char *beg_addr = BUF_BEG_ADDR (buf);
 903
 904           coding->source = beg_addr + coding->src_pos_byte - 1;
 905           if (coding->src_pos_byte >= gpt_byte)
 906             coding->source += BUF_GAP_SIZE (buf);
 907         }
 908     }
 909   else if (STRINGP (coding->src_object))
 910     {
 911       coding->source = (XSTRING (coding->src_object)->data
 912                         + coding->src_pos_byte);
 913     }
 914   else
 915     /* Otherwise, the source is C string and is never relocated
 916        automatically.  Thus we don't have to update anything.  */
 917     ;
 918 }
 919
 920 static void
 921 coding_set_destination (coding)
 922      struct coding_system *coding;
 923 {
 924   if (BUFFERP (coding->dst_object))
 925     {
 926       /* We are sure that coding->dst_pos_byte is before the gap of the
 927          buffer. */
 928       coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 929                              + coding->dst_pos_byte - 1);
 930       if (coding->src_pos < 0)
 931         coding->dst_bytes = (GAP_END_ADDR
 932                              - (coding->src_bytes - coding->consumed)
 933                              - coding->destination);
 934       else
 935         coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 936                              - coding->destination);
 937     }
 938   else
 939     /* Otherwise, the destination is C string and is never relocated
 940        automatically.  Thus we don't have to update anything.  */
 941     ;
 942 }
 943
 944
 945 static void
 946 coding_alloc_by_realloc (coding, bytes)
 947      struct coding_system *coding;
 948      EMACS_INT bytes;
 949 {
 950   coding->destination = (unsigned char *) xrealloc (coding->destination,
 951                                                     coding->dst_bytes + bytes);
 952   coding->dst_bytes += bytes;
 953 }
 954
 955 static void
 956 coding_alloc_by_making_gap (coding, bytes)
 957      struct coding_system *coding;
 958      EMACS_INT bytes;
 959 {
 960   if (BUFFERP (coding->dst_object)
 961       && EQ (coding->src_object, coding->dst_object))
 962     {
 963       EMACS_INT add = coding->src_bytes - coding->consumed;
 964
 965       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
 966       make_gap (bytes);
 967       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
 968     }
 969   else
 970     {
 971       Lisp_Object this_buffer;
 972
 973       this_buffer = Fcurrent_buffer ();
 974       set_buffer_internal (XBUFFER (coding->dst_object));
 975       make_gap (bytes);
 976       set_buffer_internal (XBUFFER (this_buffer));
 977     }
 978 }
 979
 980
 981 static unsigned char *
 982 alloc_destination (coding, nbytes, dst)
 983      struct coding_system *coding;
 984      int nbytes;
 985      unsigned char *dst;
 986 {
 987   EMACS_INT offset = dst - coding->destination;
 988
 989   if (BUFFERP (coding->dst_object))
 990     coding_alloc_by_making_gap (coding, nbytes);
 991   else
 992     coding_alloc_by_realloc (coding, nbytes);
 993   coding->result = CODING_RESULT_SUCCESS;
 994   coding_set_destination (coding);
 995   dst = coding->destination + offset;
 996   return dst;
 997 }
 998
 999 \f
1000 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1001
1002
1003
1004 \f
1005 /*** 3. UTF-8 ***/
1006
1007 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1008    Check if a text is encoded in UTF-8.  If it is, return
1009    CATEGORY_MASK_UTF_8, else return 0.  */
1010
1011 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1012 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1013 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1014 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1015 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1016 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1017
1018 static int
1019 detect_coding_utf_8 (coding, mask)
1020      struct coding_system *coding;
1021      int *mask;
1022 {
1023   unsigned char *src = coding->source, *src_base = src;
1024   unsigned char *src_end = coding->source + coding->src_bytes;
1025   int multibytep = coding->src_multibyte;
1026   int consumed_chars = 0;
1027   int found = 0;
1028
1029   /* A coding system of this category is always ASCII compatible.  */
1030   src += coding->head_ascii;
1031
1032   while (1)
1033     {
1034       int c, c1, c2, c3, c4;
1035
1036       ONE_MORE_BYTE (c);
1037       if (UTF_8_1_OCTET_P (c))
1038         continue;
1039       ONE_MORE_BYTE (c1);
1040       if (! UTF_8_EXTRA_OCTET_P (c1))
1041         break;
1042       if (UTF_8_2_OCTET_LEADING_P (c))
1043         {
1044           found++;
1045           continue;
1046         }
1047       ONE_MORE_BYTE (c2);
1048       if (! UTF_8_EXTRA_OCTET_P (c2))
1049         break;
1050       if (UTF_8_3_OCTET_LEADING_P (c))
1051         {
1052           found++;
1053           continue;
1054         }
1055       ONE_MORE_BYTE (c3);
1056       if (! UTF_8_EXTRA_OCTET_P (c3))
1057         break;
1058       if (UTF_8_4_OCTET_LEADING_P (c))
1059         {
1060           found++;
1061           continue;
1062         }
1063       ONE_MORE_BYTE (c4);
1064       if (! UTF_8_EXTRA_OCTET_P (c4))
1065         break;
1066       if (UTF_8_5_OCTET_LEADING_P (c))
1067         {
1068           found++;
1069           continue;
1070         }
1071       break;
1072     }
1073   *mask &= ~CATEGORY_MASK_UTF_8;
1074   return 0;
1075
1076  no_more_source:
1077   if (! found)
1078     return 0;
1079   *mask &= CATEGORY_MASK_UTF_8;
1080   return 1;
1081 }
1082
1083
1084 static void
1085 decode_coding_utf_8 (coding)
1086      struct coding_system *coding;
1087 {
1088   unsigned char *src = coding->source + coding->consumed;
1089   unsigned char *src_end = coding->source + coding->src_bytes;
1090   unsigned char *src_base;
1091   int *charbuf = coding->charbuf;
1092   int *charbuf_end = charbuf + coding->charbuf_size;
1093   int consumed_chars = 0, consumed_chars_base;
1094   int multibytep = coding->src_multibyte;
1095   Lisp_Object attr, eol_type, charset_list;
1096
1097   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1098
1099   while (1)
1100     {
1101       int c, c1, c2, c3, c4, c5;
1102
1103       src_base = src;
1104       consumed_chars_base = consumed_chars;
1105
1106       if (charbuf >= charbuf_end)
1107         break;
1108
1109       ONE_MORE_BYTE (c1);
1110       if (UTF_8_1_OCTET_P(c1))
1111         {
1112           c = c1;
1113           if (c == '\r')
1114             {
1115               if (EQ (eol_type, Qdos))
1116                 {
1117                   if (src == src_end)
1118                     goto no_more_source;
1119                   if (*src == '\n')
1120                     ONE_MORE_BYTE (c);
1121                 }
1122               else if (EQ (eol_type, Qmac))
1123                 c = '\n';
1124             }
1125         }
1126       else
1127         {
1128           ONE_MORE_BYTE (c2);
1129           if (! UTF_8_EXTRA_OCTET_P (c2))
1130             goto invalid_code;
1131           if (UTF_8_2_OCTET_LEADING_P (c1))
1132             c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1133           else
1134             {
1135               ONE_MORE_BYTE (c3);
1136               if (! UTF_8_EXTRA_OCTET_P (c3))
1137                 goto invalid_code;
1138               if (UTF_8_3_OCTET_LEADING_P (c1))
1139                 c = (((c1 & 0xF) << 12)
1140                      | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1141               else
1142                 {
1143                   ONE_MORE_BYTE (c4);
1144                   if (! UTF_8_EXTRA_OCTET_P (c4))
1145                     goto invalid_code;
1146                   if (UTF_8_4_OCTET_LEADING_P (c1))
1147                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1148                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1149                   else
1150                     {
1151                       ONE_MORE_BYTE (c5);
1152                       if (! UTF_8_EXTRA_OCTET_P (c5))
1153                         goto invalid_code;
1154                       if (UTF_8_5_OCTET_LEADING_P (c1))
1155                         {
1156                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1157                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1158                                | (c5 & 0x3F));
1159                           if (c > MAX_CHAR)
1160                             goto invalid_code;
1161                         }
1162                       else
1163                         goto invalid_code;
1164                     }
1165                 }
1166             }
1167         }
1168
1169       *charbuf++ = c;
1170       continue;
1171
1172     invalid_code:
1173       src = src_base;
1174       consumed_chars = consumed_chars_base;
1175       ONE_MORE_BYTE (c);
1176       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1177       coding->errors++;
1178     }
1179
1180  no_more_source:
1181   coding->consumed_char += consumed_chars_base;
1182   coding->consumed = src_base - coding->source;
1183   coding->charbuf_used = charbuf - coding->charbuf;
1184 }
1185
1186
1187 static int
1188 encode_coding_utf_8 (coding)
1189      struct coding_system *coding;
1190 {
1191   int multibytep = coding->dst_multibyte;
1192   int *charbuf = coding->charbuf;
1193   int *charbuf_end = charbuf + coding->charbuf_used;
1194   unsigned char *dst = coding->destination + coding->produced;
1195   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1196   int produced_chars = 0;
1197   int c;
1198
1199   if (multibytep)
1200     {
1201       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1202
1203       while (charbuf < charbuf_end)
1204         {
1205           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1206
1207           ASSURE_DESTINATION (safe_room);
1208           c = *charbuf++;
1209           CHAR_STRING_ADVANCE (c, pend);
1210           for (p = str; p < pend; p++)
1211             EMIT_ONE_BYTE (*p);
1212         }
1213     }
1214   else
1215     {
1216       int safe_room = MAX_MULTIBYTE_LENGTH;
1217
1218       while (charbuf < charbuf_end)
1219         {
1220           ASSURE_DESTINATION (safe_room);
1221           c = *charbuf++;
1222           dst += CHAR_STRING (c, dst);
1223           produced_chars++;
1224         }
1225     }
1226   coding->result = CODING_RESULT_SUCCESS;
1227   coding->produced_char += produced_chars;
1228   coding->produced = dst - coding->destination;
1229   return 0;
1230 }
1231
1232
1233 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1234    Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1235    Little Endian (otherwise).  If it is, return
1236    CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
1237    else return 0.  */
1238
1239 #define UTF_16_HIGH_SURROGATE_P(val) \
1240   (((val) & 0xFC00) == 0xD800)
1241
1242 #define UTF_16_LOW_SURROGATE_P(val) \
1243   (((val) & 0xFC00) == 0xDC00)
1244
1245 #define UTF_16_INVALID_P(val)   \
1246   (((val) == 0xFFFE)            \
1247    || ((val) == 0xFFFF)         \
1248    || UTF_16_LOW_SURROGATE_P (val))
1249
1250
1251 static int
1252 detect_coding_utf_16 (coding, mask)
1253      struct coding_system *coding;
1254      int *mask;
1255 {
1256   unsigned char *src = coding->source, *src_base = src;
1257   unsigned char *src_end = coding->source + coding->src_bytes;
1258   int multibytep = coding->src_multibyte;
1259   int consumed_chars = 0;
1260   int c1, c2;
1261
1262   ONE_MORE_BYTE (c1);
1263   ONE_MORE_BYTE (c2);
1264
1265   if ((c1 == 0xFF) && (c2 == 0xFE))
1266     {
1267       *mask &= CATEGORY_MASK_UTF_16_LE;
1268       return 1;
1269     }
1270   else if ((c1 == 0xFE) && (c2 == 0xFF))
1271     {
1272       *mask &= CATEGORY_MASK_UTF_16_BE;
1273       return 1;
1274     }
1275  no_more_source:
1276   return 0;
1277 }
1278
1279 static void
1280 decode_coding_utf_16 (coding)
1281      struct coding_system *coding;
1282 {
1283   unsigned char *src = coding->source + coding->consumed;
1284   unsigned char *src_end = coding->source + coding->src_bytes;
1285   unsigned char *src_base;
1286   int *charbuf = coding->charbuf;
1287   int *charbuf_end = charbuf + coding->charbuf_size;
1288   int consumed_chars = 0, consumed_chars_base;
1289   int multibytep = coding->src_multibyte;
1290   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1291   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1292   int surrogate = CODING_UTF_16_SURROGATE (coding);
1293   Lisp_Object attr, eol_type, charset_list;
1294
1295   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1296
1297   if (bom != utf_16_without_bom)
1298     {
1299       int c, c1, c2;
1300
1301       src_base = src;
1302       ONE_MORE_BYTE (c1);
1303       ONE_MORE_BYTE (c2);
1304       c = (c1 << 8) | c2;
1305       if (bom == utf_16_with_bom)
1306         {
1307           if (endian == utf_16_big_endian
1308               ? c != 0xFFFE : c != 0xFEFF)
1309             {
1310               /* We are sure that there's enouph room at CHARBUF.  */
1311               *charbuf++ = c1;
1312               *charbuf++ = c2;
1313               coding->errors++;
1314             }
1315         }
1316       else
1317         {
1318           if (c == 0xFFFE)
1319             CODING_UTF_16_ENDIAN (coding)
1320               = endian = utf_16_big_endian;
1321           else if (c == 0xFEFF)
1322             CODING_UTF_16_ENDIAN (coding)
1323               = endian = utf_16_little_endian;
1324           else
1325             {
1326               CODING_UTF_16_ENDIAN (coding)
1327                 = endian = utf_16_big_endian;
1328               src = src_base;
1329             }
1330         }
1331       CODING_UTF_16_BOM (coding) = utf_16_with_bom;
1332     }
1333
1334   while (1)
1335     {
1336       int c, c1, c2;
1337
1338       src_base = src;
1339       consumed_chars_base = consumed_chars;
1340
1341       if (charbuf + 2 >= charbuf_end)
1342         break;
1343
1344       ONE_MORE_BYTE (c1);
1345       ONE_MORE_BYTE (c2);
1346       c = (endian == utf_16_big_endian
1347            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1348       if (surrogate)
1349         {
1350           if (! UTF_16_LOW_SURROGATE_P (c))
1351             {
1352               if (endian == utf_16_big_endian)
1353                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1354               else
1355                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1356               *charbuf++ = c1;
1357               *charbuf++ = c2;
1358               coding->errors++;
1359               if (UTF_16_HIGH_SURROGATE_P (c))
1360                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1361               else
1362                 *charbuf++ = c;
1363             }
1364           else
1365             {
1366               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1367               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1368               *charbuf++ = c;
1369             }
1370         }
1371       else
1372         {
1373           if (UTF_16_HIGH_SURROGATE_P (c))
1374             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1375           else
1376             *charbuf++ = c;
1377         }
1378     }
1379
1380  no_more_source:
1381   coding->consumed_char += consumed_chars_base;
1382   coding->consumed = src_base - coding->source;
1383   coding->charbuf_used = charbuf - coding->charbuf;
1384 }
1385
1386 static int
1387 encode_coding_utf_16 (coding)
1388      struct coding_system *coding;
1389 {
1390   int multibytep = coding->dst_multibyte;
1391   int *charbuf = coding->charbuf;
1392   int *charbuf_end = charbuf + coding->charbuf_used;
1393   unsigned char *dst = coding->destination + coding->produced;
1394   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1395   int safe_room = 8;
1396   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1397   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1398   int produced_chars = 0;
1399   Lisp_Object attrs, eol_type, charset_list;
1400   int c;
1401
1402   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1403
1404   if (bom == utf_16_with_bom)
1405     {
1406       ASSURE_DESTINATION (safe_room);
1407       if (big_endian)
1408         EMIT_TWO_BYTES (0xFF, 0xFE);
1409       else
1410         EMIT_TWO_BYTES (0xFE, 0xFF);
1411       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1412     }
1413
1414   while (charbuf < charbuf_end)
1415     {
1416       ASSURE_DESTINATION (safe_room);
1417       c = *charbuf++;
1418       if (c >= MAX_UNICODE_CHAR)
1419         c = coding->default_char;
1420
1421       if (c < 0x10000)
1422         {
1423           if (big_endian)
1424             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1425           else
1426             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1427         }
1428       else
1429         {
1430           int c1, c2;
1431
1432           c -= 0x10000;
1433           c1 = (c >> 10) + 0xD800;
1434           c2 = (c & 0x3FF) + 0xDC00;
1435           if (big_endian)
1436             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1437           else
1438             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1439         }
1440     }
1441   coding->result = CODING_RESULT_SUCCESS;
1442   coding->produced = dst - coding->destination;
1443   coding->produced_char += produced_chars;
1444   return 0;
1445 }
1446
1447 \f
1448 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1449
1450 /* Emacs' internal format for representation of multiple character
1451    sets is a kind of multi-byte encoding, i.e. characters are
1452    represented by variable-length sequences of one-byte codes.
1453
1454    ASCII characters and control characters (e.g. `tab', `newline') are
1455    represented by one-byte sequences which are their ASCII codes, in
1456    the range 0x00 through 0x7F.
1457
1458    8-bit characters of the range 0x80..0x9F are represented by
1459    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1460    code + 0x20).
1461
1462    8-bit characters of the range 0xA0..0xFF are represented by
1463    one-byte sequences which are their 8-bit code.
1464
1465    The other characters are represented by a sequence of `base
1466    leading-code', optional `extended leading-code', and one or two
1467    `position-code's.  The length of the sequence is determined by the
1468    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1469    whereas extended leading-code and position-code take the range 0xA0
1470    through 0xFF.  See `charset.h' for more details about leading-code
1471    and position-code.
1472
1473    --- CODE RANGE of Emacs' internal format ---
1474    character set        range
1475    -------------        -----
1476    ascii                0x00..0x7F
1477    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1478    eight-bit-graphic    0xA0..0xBF
1479    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1480    ---------------------------------------------
1481
1482    As this is the internal character representation, the format is
1483    usually not used externally (i.e. in a file or in a data sent to a
1484    process).  But, it is possible to have a text externally in this
1485    format (i.e. by encoding by the coding system `emacs-mule').
1486
1487    In that case, a sequence of one-byte codes has a slightly different
1488    form.
1489
1490    At first, all characters in eight-bit-control are represented by
1491    one-byte sequences which are their 8-bit code.
1492
1493    Next, character composition data are represented by the byte
1494    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1495    where,
1496         METHOD is 0xF0 plus one of composition method (enum
1497         composition_method),
1498
1499         BYTES is 0xA0 plus a byte length of this composition data,
1500
1501         CHARS is 0x20 plus a number of characters composed by this
1502         data,
1503
1504         COMPONENTs are characters of multibye form or composition
1505         rules encoded by two-byte of ASCII codes.
1506
1507    In addition, for backward compatibility, the following formats are
1508    also recognized as composition data on decoding.
1509
1510    0x80 MSEQ ...
1511    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1512
1513    Here,
1514         MSEQ is a multibyte form but in these special format:
1515           ASCII: 0xA0 ASCII_CODE+0x80,
1516           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1517         RULE is a one byte code of the range 0xA0..0xF0 that
1518         represents a composition rule.
1519   */
1520
1521 char emacs_mule_bytes[256];
1522
1523 /* Leading-code followed by extended leading-code.  */
1524 #define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
1525 #define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
1526 #define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
1527 #define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
1528
1529
1530 int
1531 emacs_mule_char (coding, composition, nbytes, nchars)
1532      struct coding_system *coding;
1533      int composition;
1534      int *nbytes, *nchars;
1535 {
1536   unsigned char *src = coding->source + coding->consumed;
1537   unsigned char *src_end = coding->source + coding->src_bytes;
1538   int multibytep = coding->src_multibyte;
1539   unsigned char *src_base = src;
1540   struct charset *charset;
1541   unsigned code;
1542   int c;
1543   int consumed_chars = 0;
1544
1545   ONE_MORE_BYTE (c);
1546   if (composition)
1547     {
1548       c -= 0x20;
1549       if (c == 0x80)
1550         {
1551           ONE_MORE_BYTE (c);
1552           if (c < 0xA0)
1553             goto invalid_code;
1554           *nbytes = src - src_base;
1555           *nchars = consumed_chars;
1556           return (c - 0x80);
1557         }
1558     }
1559
1560   switch (emacs_mule_bytes[c])
1561     {
1562     case 2:
1563       if (! (charset = emacs_mule_charset[c]))
1564         goto invalid_code;
1565       ONE_MORE_BYTE (c);
1566       code = c & 0x7F;
1567       break;
1568
1569     case 3:
1570       if (c == LEADING_CODE_PRIVATE_11
1571           || c == LEADING_CODE_PRIVATE_12)
1572         {
1573           ONE_MORE_BYTE (c);
1574           if (! (charset = emacs_mule_charset[c]))
1575             goto invalid_code;
1576           ONE_MORE_BYTE (c);
1577           code = c & 0x7F;
1578         }
1579       else
1580         {
1581           if (! (charset = emacs_mule_charset[c]))
1582             goto invalid_code;
1583           ONE_MORE_BYTE (c);
1584           code = (c & 0x7F) << 7;
1585           ONE_MORE_BYTE (c);
1586           code |= c & 0x7F;
1587         }
1588       break;
1589
1590     case 4:
1591       if (! (charset = emacs_mule_charset[c]))
1592         goto invalid_code;
1593       ONE_MORE_BYTE (c);
1594       code = (c & 0x7F) << 7;
1595       ONE_MORE_BYTE (c);
1596       code |= c & 0x7F;
1597       break;
1598
1599     case 1:
1600       code = c;
1601       charset = CHARSET_FROM_ID (ASCII_BYTE_P (code) ? charset_ascii
1602                                  : code < 0xA0 ? charset_8_bit_control
1603                                  : charset_8_bit_graphic);
1604       break;
1605
1606     default:
1607       abort ();
1608     }
1609   c = DECODE_CHAR (charset, code);
1610   if (c < 0)
1611     goto invalid_code;
1612   *nbytes = src - src_base;
1613   *nchars = consumed_chars;
1614   return c;
1615
1616  no_more_source:
1617   return -2;
1618
1619  invalid_code:
1620   return -1;
1621 }
1622
1623
1624 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1625    Check if a text is encoded in `emacs-mule'.  */
1626
1627 static int
1628 detect_coding_emacs_mule (coding, mask)
1629      struct coding_system *coding;
1630      int *mask;
1631 {
1632   unsigned char *src = coding->source, *src_base = src;
1633   unsigned char *src_end = coding->source + coding->src_bytes;
1634   int multibytep = coding->src_multibyte;
1635   int consumed_chars = 0;
1636   int c;
1637   int found = 0;
1638
1639   /* A coding system of this category is always ASCII compatible.  */
1640   src += coding->head_ascii;
1641
1642   while (1)
1643     {
1644       ONE_MORE_BYTE (c);
1645
1646       if (c == 0x80)
1647         {
1648           /* Perhaps the start of composite character.  We simple skip
1649              it because analyzing it is too heavy for detecting.  But,
1650              at least, we check that the composite character
1651              constitues of more than 4 bytes.  */
1652           unsigned char *src_base;
1653
1654         repeat:
1655           src_base = src;
1656           do
1657             {
1658               ONE_MORE_BYTE (c);
1659             }
1660           while (c >= 0xA0);
1661
1662           if (src - src_base <= 4)
1663             break;
1664           found = 1;
1665           if (c == 0x80)
1666             goto repeat;
1667         }
1668
1669       if (c < 0x80)
1670         {
1671           if (c < 0x20
1672               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1673             break;
1674         }
1675       else
1676         {
1677           unsigned char *src_base = src - 1;
1678
1679           do
1680             {
1681               ONE_MORE_BYTE (c);
1682             }
1683           while (c >= 0xA0);
1684           if (src - src_base != emacs_mule_bytes[*src_base])
1685             break;
1686           found = 1;
1687         }
1688     }
1689   *mask &= ~CATEGORY_MASK_EMACS_MULE;
1690   return 0;
1691
1692  no_more_source:
1693   if (!found)
1694     return 0;
1695   *mask &= CATEGORY_MASK_EMACS_MULE;
1696   return 1;
1697 }
1698
1699
1700 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1701
1702 /* Decode a character represented as a component of composition
1703    sequence of Emacs 20/21 style at SRC.  Set C to that character and
1704    update SRC to the head of next character (or an encoded composition
1705    rule).  If SRC doesn't points a composition component, set C to -1.
1706    If SRC points an invalid byte sequence, global exit by a return
1707    value 0.  */
1708
1709 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1710   if (1)                                                        \
1711     {                                                           \
1712       int c;                                                    \
1713       int nbytes, nchars;                                       \
1714                                                                 \
1715       if (src == src_end)                                       \
1716         break;                                                  \
1717       c = emacs_mule_char (coding, 1, &nbytes, &nchars);        \
1718       if (c < 0)                                                \
1719         {                                                       \
1720           if (c == -2)                                          \
1721             break;                                              \
1722           goto invalid_code;                                    \
1723         }                                                       \
1724       *buf++ = c;                                               \
1725       src += nbytes;                                            \
1726       consumed_chars += nchars;                                 \
1727     }                                                           \
1728   else
1729
1730
1731 /* Decode a composition rule represented as a component of composition
1732    sequence of Emacs 20 style at SRC.  Set C to the rule.  If SRC
1733    points an invalid byte sequence, set C to -1.  */
1734
1735 #define DECODE_EMACS_MULE_COMPOSITION_RULE(buf)         \
1736   do {                                                  \
1737     int c, gref, nref;                                  \
1738                                                         \
1739     if (src < src_end)                                  \
1740       goto invalid_code;                                \
1741     ONE_MORE_BYTE_NO_CHECK (c);                         \
1742     c -= 0xA0;                                          \
1743     if (c < 0 || c >= 81)                               \
1744       goto invalid_code;                                \
1745                                                         \
1746     gref = c / 9, nref = c % 9;                         \
1747     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1748   } while (0)
1749
1750
1751 #define ADD_COMPOSITION_DATA(buf, method, nchars)       \
1752   do {                                                  \
1753     *buf++ = -5;                                        \
1754     *buf++ = coding->produced_char + char_offset;       \
1755     *buf++ = CODING_ANNOTATE_COMPOSITION_MASK;          \
1756     *buf++ = method;                                    \
1757     *buf++ = nchars;                                    \
1758   } while (0)
1759
1760
1761 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1762   do {                                                                  \
1763     /* Emacs 21 style format.  The first three bytes at SRC are         \
1764        (METHOD - 0xF0), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1765        the byte length of this composition information, CHARS is the    \
1766        number of characters composed by this composition.  */           \
1767     enum composition_method method = c - 0xF0;                          \
1768     int consumed_chars_limit;                                           \
1769     int nbytes, nchars;                                                 \
1770                                                                         \
1771     ONE_MORE_BYTE (c);                                                  \
1772     nbytes = c - 0xA0;                                                  \
1773     if (nbytes < 3)                                                     \
1774       goto invalid_code;                                                \
1775     ONE_MORE_BYTE (c);                                                  \
1776     nchars = c - 0xA0;                                                  \
1777     ADD_COMPOSITION_DATA (charbuf, method, nchars);                     \
1778     consumed_chars_limit = consumed_chars_base + nbytes;                \
1779     if (method != COMPOSITION_RELATIVE)                                 \
1780       {                                                                 \
1781         int i = 0;                                                      \
1782         while (consumed_chars < consumed_chars_limit)                   \
1783           {                                                             \
1784             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1785               DECODE_EMACS_MULE_COMPOSITION_RULE (charbuf);             \
1786             else                                                        \
1787               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1788           }                                                             \
1789         if (consumed_chars < consumed_chars_limit)                      \
1790           goto invalid_code;                                            \
1791       }                                                                 \
1792   } while (0)
1793
1794
1795 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1796   do {                                                          \
1797     /* Emacs 20 style format for relative composition.  */      \
1798     /* Store multibyte form of characters to be composed.  */   \
1799     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1800     int *buf = components;                                      \
1801     int i, j;                                                   \
1802                                                                 \
1803     src = src_base;                                             \
1804     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1805     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1806       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
1807     if (i < 2)                                                  \
1808       goto invalid_code;                                        \
1809     ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i);    \
1810     for (j = 0; j < i; j++)                                     \
1811       *charbuf++ = components[j];                               \
1812   } while (0)
1813
1814
1815 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
1816   do {                                                          \
1817     /* Emacs 20 style format for rule-base composition.  */     \
1818     /* Store multibyte form of characters to be composed.  */   \
1819     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1820     int *buf = components;                                      \
1821     int i, j;                                                   \
1822                                                                 \
1823     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
1824     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1825       {                                                         \
1826         DECODE_EMACS_MULE_COMPOSITION_RULE (buf);               \
1827         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
1828       }                                                         \
1829     if (i < 1 || (buf - components) % 2 == 0)                   \
1830       goto invalid_code;                                        \
1831     if (charbuf + i + (i / 2) + 1 < charbuf_end)                \
1832       goto no_more_source;                                      \
1833     ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i);       \
1834     for (j = 0; j < i; j++)                                     \
1835       *charbuf++ = components[j];                               \
1836     for (j = 0; j < i; j += 2)                                  \
1837       *charbuf++ = components[j];                               \
1838   } while (0)
1839
1840
1841 static void
1842 decode_coding_emacs_mule (coding)
1843      struct coding_system *coding;
1844 {
1845   unsigned char *src = coding->source + coding->consumed;
1846   unsigned char *src_end = coding->source + coding->src_bytes;
1847   unsigned char *src_base;
1848   int *charbuf = coding->charbuf;
1849   int *charbuf_end = charbuf + coding->charbuf_size;
1850   int consumed_chars = 0, consumed_chars_base;
1851   int char_offset = 0;
1852   int multibytep = coding->src_multibyte;
1853   Lisp_Object attrs, eol_type, charset_list;
1854
1855   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1856
1857   while (1)
1858     {
1859       int c;
1860
1861       src_base = src;
1862       consumed_chars_base = consumed_chars;
1863
1864       if (charbuf >= charbuf_end)
1865         break;
1866
1867       ONE_MORE_BYTE (c);
1868
1869       if (c < 0x80)
1870         {
1871           if (c == '\r')
1872             {
1873               if (EQ (eol_type, Qdos))
1874                 {
1875                   if (src == src_end)
1876                     goto no_more_source;
1877                   if (*src == '\n')
1878                     ONE_MORE_BYTE (c);
1879                 }
1880               else if (EQ (eol_type, Qmac))
1881                 c = '\n';
1882             }
1883           *charbuf++ = c;
1884           char_offset++;
1885         }
1886       else if (c == 0x80)
1887         {
1888           if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end)
1889             break;
1890           ONE_MORE_BYTE (c);
1891           if (c - 0xF0 >= COMPOSITION_RELATIVE
1892               && c - 0xF0 <= COMPOSITION_WITH_RULE_ALTCHARS)
1893             DECODE_EMACS_MULE_21_COMPOSITION (c);
1894           else if (c < 0xC0)
1895             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1896           else if (c == 0xFF)
1897             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1898           else
1899             goto invalid_code;
1900         }
1901       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1902         {
1903           int nbytes, nchars;
1904           src--;
1905           c = emacs_mule_char (coding, 0, &nbytes, &nchars);
1906           if (c < 0)
1907             {
1908               if (c == -2)
1909                 break;
1910               goto invalid_code;
1911             }
1912           *charbuf++ = c;
1913           char_offset++;
1914         }
1915       continue;
1916
1917     invalid_code:
1918       src = src_base;
1919       consumed_chars = consumed_chars_base;
1920       ONE_MORE_BYTE (c);
1921       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1922       coding->errors++;
1923     }
1924
1925  no_more_source:
1926   coding->consumed_char += consumed_chars_base;
1927   coding->consumed = src_base - coding->source;
1928   coding->charbuf_used = charbuf - coding->charbuf;
1929 }
1930
1931
1932 #define EMACS_MULE_LEADING_CODES(id, codes)     \
1933   do {                                          \
1934     if (id < 0xA0)                              \
1935       codes[0] = id, codes[1] = 0;              \
1936     else if (id < 0xE0)                         \
1937       codes[0] = 0x9A, codes[1] = id;           \
1938     else if (id < 0xF0)                         \
1939       codes[0] = 0x9B, codes[1] = id;           \
1940     else if (id < 0xF5)                         \
1941       codes[0] = 0x9C, codes[1] = id;           \
1942     else                                        \
1943       codes[0] = 0x9D, codes[1] = id;           \
1944   } while (0);
1945
1946
1947 static int
1948 encode_coding_emacs_mule (coding)
1949      struct coding_system *coding;
1950 {
1951   int multibytep = coding->dst_multibyte;
1952   int *charbuf = coding->charbuf;
1953   int *charbuf_end = charbuf + coding->charbuf_used;
1954   unsigned char *dst = coding->destination + coding->produced;
1955   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1956   int safe_room = 8;
1957   int produced_chars = 0;
1958   Lisp_Object attrs, eol_type, charset_list;
1959   int c;
1960
1961   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1962
1963   while (charbuf < charbuf_end)
1964     {
1965       ASSURE_DESTINATION (safe_room);
1966       c = *charbuf++;
1967       if (ASCII_CHAR_P (c))
1968         EMIT_ONE_ASCII_BYTE (c);
1969       else
1970         {
1971           struct charset *charset;
1972           unsigned code;
1973           int dimension;
1974           int emacs_mule_id;
1975           unsigned char leading_codes[2];
1976
1977           charset = char_charset (c, charset_list, &code);
1978           if (! charset)
1979             {
1980               c = coding->default_char;
1981               if (ASCII_CHAR_P (c))
1982                 {
1983                   EMIT_ONE_ASCII_BYTE (c);
1984                   continue;
1985                 }
1986               charset = char_charset (c, charset_list, &code);
1987             }
1988           dimension = CHARSET_DIMENSION (charset);
1989           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
1990           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
1991           EMIT_ONE_BYTE (leading_codes[0]);
1992           if (leading_codes[1])
1993             EMIT_ONE_BYTE (leading_codes[1]);
1994           if (dimension == 1)
1995             EMIT_ONE_BYTE (code);
1996           else
1997             {
1998               EMIT_ONE_BYTE (code >> 8);
1999               EMIT_ONE_BYTE (code & 0xFF);
2000             }
2001         }
2002     }
2003   coding->result = CODING_RESULT_SUCCESS;
2004   coding->produced_char += produced_chars;
2005   coding->produced = dst - coding->destination;
2006   return 0;
2007 }
2008
2009 \f
2010 /*** 7. ISO2022 handlers ***/
2011
2012 /* The following note describes the coding system ISO2022 briefly.
2013    Since the intention of this note is to help understand the
2014    functions in this file, some parts are NOT ACCURATE or OVERLY
2015    SIMPLIFIED.  For thorough understanding, please refer to the
2016    original document of ISO2022.
2017
2018    ISO2022 provides many mechanisms to encode several character sets
2019    in 7-bit and 8-bit environments.  For 7-bite environments, all text
2020    is encoded using bytes less than 128.  This may make the encoded
2021    text a little bit longer, but the text passes more easily through
2022    several gateways, some of which strip off MSB (Most Signigant Bit).
2023
2024    There are two kinds of character sets: control character set and
2025    graphic character set.  The former contains control characters such
2026    as `newline' and `escape' to provide control functions (control
2027    functions are also provided by escape sequences).  The latter
2028    contains graphic characters such as 'A' and '-'.  Emacs recognizes
2029    two control character sets and many graphic character sets.
2030
2031    Graphic character sets are classified into one of the following
2032    four classes, according to the number of bytes (DIMENSION) and
2033    number of characters in one dimension (CHARS) of the set:
2034    - DIMENSION1_CHARS94
2035    - DIMENSION1_CHARS96
2036    - DIMENSION2_CHARS94
2037    - DIMENSION2_CHARS96
2038
2039    In addition, each character set is assigned an identification tag,
2040    unique for each set, called "final character" (denoted as <F>
2041    hereafter).  The <F> of each character set is decided by ECMA(*)
2042    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2043    (0x30..0x3F are for private use only).
2044
2045    Note (*): ECMA = European Computer Manufacturers Association
2046
2047    Here are examples of graphic character set [NAME(<F>)]:
2048         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2049         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2050         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2051         o DIMENSION2_CHARS96 -- none for the moment
2052
2053    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2054         C0 [0x00..0x1F] -- control character plane 0
2055         GL [0x20..0x7F] -- graphic character plane 0
2056         C1 [0x80..0x9F] -- control character plane 1
2057         GR [0xA0..0xFF] -- graphic character plane 1
2058
2059    A control character set is directly designated and invoked to C0 or
2060    C1 by an escape sequence.  The most common case is that:
2061    - ISO646's  control character set is designated/invoked to C0, and
2062    - ISO6429's control character set is designated/invoked to C1,
2063    and usually these designations/invocations are omitted in encoded
2064    text.  In a 7-bit environment, only C0 can be used, and a control
2065    character for C1 is encoded by an appropriate escape sequence to
2066    fit into the environment.  All control characters for C1 are
2067    defined to have corresponding escape sequences.
2068
2069    A graphic character set is at first designated to one of four
2070    graphic registers (G0 through G3), then these graphic registers are
2071    invoked to GL or GR.  These designations and invocations can be
2072    done independently.  The most common case is that G0 is invoked to
2073    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2074    these invocations and designations are omitted in encoded text.
2075    In a 7-bit environment, only GL can be used.
2076
2077    When a graphic character set of CHARS94 is invoked to GL, codes
2078    0x20 and 0x7F of the GL area work as control characters SPACE and
2079    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2080    be used.
2081
2082    There are two ways of invocation: locking-shift and single-shift.
2083    With locking-shift, the invocation lasts until the next different
2084    invocation, whereas with single-shift, the invocation affects the
2085    following character only and doesn't affect the locking-shift
2086    state.  Invocations are done by the following control characters or
2087    escape sequences:
2088
2089    ----------------------------------------------------------------------
2090    abbrev  function                  cntrl escape seq   description
2091    ----------------------------------------------------------------------
2092    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2093    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2094    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2095    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2096    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2097    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2098    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2099    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2100    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2101    ----------------------------------------------------------------------
2102    (*) These are not used by any known coding system.
2103
2104    Control characters for these functions are defined by macros
2105    ISO_CODE_XXX in `coding.h'.
2106
2107    Designations are done by the following escape sequences:
2108    ----------------------------------------------------------------------
2109    escape sequence      description
2110    ----------------------------------------------------------------------
2111    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2112    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2113    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2114    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2115    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2116    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2117    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2118    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2119    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2120    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2121    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2122    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2123    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2124    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2125    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2126    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2127    ----------------------------------------------------------------------
2128
2129    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2130    of dimension 1, chars 94, and final character <F>, etc...
2131
2132    Note (*): Although these designations are not allowed in ISO2022,
2133    Emacs accepts them on decoding, and produces them on encoding
2134    CHARS96 character sets in a coding system which is characterized as
2135    7-bit environment, non-locking-shift, and non-single-shift.
2136
2137    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2138    '(' must be omitted.  We refer to this as "short-form" hereafter.
2139
2140    Now you may notice that there are a lot of ways for encoding the
2141    same multilingual text in ISO2022.  Actually, there exist many
2142    coding systems such as Compound Text (used in X11's inter client
2143    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
2144    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
2145    localized platforms), and all of these are variants of ISO2022.
2146
2147    In addition to the above, Emacs handles two more kinds of escape
2148    sequences: ISO6429's direction specification and Emacs' private
2149    sequence for specifying character composition.
2150
2151    ISO6429's direction specification takes the following form:
2152         o CSI ']'      -- end of the current direction
2153         o CSI '0' ']'  -- end of the current direction
2154         o CSI '1' ']'  -- start of left-to-right text
2155         o CSI '2' ']'  -- start of right-to-left text
2156    The control character CSI (0x9B: control sequence introducer) is
2157    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2158
2159    Character composition specification takes the following form:
2160         o ESC '0' -- start relative composition
2161         o ESC '1' -- end composition
2162         o ESC '2' -- start rule-base composition (*)
2163         o ESC '3' -- start relative composition with alternate chars  (**)
2164         o ESC '4' -- start rule-base composition with alternate chars  (**)
2165   Since these are not standard escape sequences of any ISO standard,
2166   the use of them for these meaning is restricted to Emacs only.
2167
2168   (*) This form is used only in Emacs 20.5 and the older versions,
2169   but the newer versions can safely decode it.
2170   (**) This form is used only in Emacs 21.1 and the newer versions,
2171   and the older versions can't decode it.
2172
2173   Here's a list of examples usages of these composition escape
2174   sequences (categorized by `enum composition_method').
2175
2176   COMPOSITION_RELATIVE:
2177         ESC 0 CHAR [ CHAR ] ESC 1
2178   COMPOSITOIN_WITH_RULE:
2179         ESC 2 CHAR [ RULE CHAR ] ESC 1
2180   COMPOSITION_WITH_ALTCHARS:
2181         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2182   COMPOSITION_WITH_RULE_ALTCHARS:
2183         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2184
2185 enum iso_code_class_type iso_code_class[256];
2186
2187 #define SAFE_CHARSET_P(coding, id)      \
2188   ((id) <= (coding)->max_charset_id     \
2189    && (coding)->safe_charsets[id] >= 0)
2190
2191
2192 #define SHIFT_OUT_OK(category)  \
2193   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2194
2195 static void
2196 setup_iso_safe_charsets (Lisp_Object attrs)
2197 {
2198   Lisp_Object charset_list, safe_charsets;
2199   Lisp_Object request;
2200   Lisp_Object reg_usage;
2201   Lisp_Object tail;
2202   int reg94, reg96;
2203   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2204   int max_charset_id;
2205
2206   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2207   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2208       && ! EQ (charset_list, Viso_2022_charset_list))
2209     {
2210       CODING_ATTR_CHARSET_LIST (attrs)
2211         = charset_list = Viso_2022_charset_list;
2212       ASET (attrs, coding_attr_safe_charsets, Qnil);
2213     }
2214
2215   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2216     return;
2217
2218   max_charset_id = 0;
2219   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2220     {
2221       int id = XINT (XCAR (tail));
2222       if (max_charset_id < id)
2223         max_charset_id = id;
2224     }
2225
2226   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2227                                 make_number (255));
2228   request = AREF (attrs, coding_attr_iso_request);
2229   reg_usage = AREF (attrs, coding_attr_iso_usage);
2230   reg94 = XINT (XCAR (reg_usage));
2231   reg96 = XINT (XCDR (reg_usage));
2232
2233   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2234     {
2235       Lisp_Object id;
2236       Lisp_Object reg;
2237       struct charset *charset;
2238
2239       id = XCAR (tail);
2240       charset = CHARSET_FROM_ID (XINT (id));
2241       reg = Fcdr (Fassq (request, id));
2242       if (! NILP (reg))
2243         XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2244       else if (charset->iso_chars_96)
2245         {
2246           if (reg96 < 4)
2247             XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2248         }
2249       else
2250         {
2251           if (reg94 < 4)
2252             XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2253         }
2254     }
2255   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2256 }
2257
2258
2259 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2260    Check if a text is encoded in ISO2022.  If it is, returns an
2261    integer in which appropriate flag bits any of:
2262         CATEGORY_MASK_ISO_7
2263         CATEGORY_MASK_ISO_7_TIGHT
2264         CATEGORY_MASK_ISO_8_1
2265         CATEGORY_MASK_ISO_8_2
2266         CATEGORY_MASK_ISO_7_ELSE
2267         CATEGORY_MASK_ISO_8_ELSE
2268    are set.  If a code which should never appear in ISO2022 is found,
2269    returns 0.  */
2270
2271 static int
2272 detect_coding_iso_2022 (coding, mask)
2273      struct coding_system *coding;
2274      int *mask;
2275 {
2276   unsigned char *src = coding->source, *src_base = src;
2277   unsigned char *src_end = coding->source + coding->src_bytes;
2278   int multibytep = coding->src_multibyte;
2279   int mask_iso = CATEGORY_MASK_ISO;
2280   int mask_found = 0, mask_8bit_found = 0;
2281   int reg[4], shift_out = 0, single_shifting = 0;
2282   int id;
2283   int c, c1;
2284   int consumed_chars = 0;
2285   int i;
2286
2287   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2288     {
2289       struct coding_system *this = &(coding_categories[i]);
2290       Lisp_Object attrs, val;
2291
2292       attrs = CODING_ID_ATTRS (this->id);
2293       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2294           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2295         setup_iso_safe_charsets (attrs);
2296       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2297       this->max_charset_id = XSTRING (val)->size - 1;
2298       this->safe_charsets = (char *) XSTRING (val)->data;
2299     }
2300
2301   /* A coding system of this category is always ASCII compatible.  */
2302   src += coding->head_ascii;
2303
2304   reg[0] = charset_ascii, reg[1] = reg[2] = reg[3] = -1;
2305   while (mask_iso && src < src_end)
2306     {
2307       ONE_MORE_BYTE (c);
2308       switch (c)
2309         {
2310         case ISO_CODE_ESC:
2311           if (inhibit_iso_escape_detection)
2312             break;
2313           single_shifting = 0;
2314           ONE_MORE_BYTE (c);
2315           if (c >= '(' && c <= '/')
2316             {
2317               /* Designation sequence for a charset of dimension 1.  */
2318               ONE_MORE_BYTE (c1);
2319               if (c1 < ' ' || c1 >= 0x80
2320                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2321                 /* Invalid designation sequence.  Just ignore.  */
2322                 break;
2323               reg[(c - '(') % 4] = id;
2324             }
2325           else if (c == '$')
2326             {
2327               /* Designation sequence for a charset of dimension 2.  */
2328               ONE_MORE_BYTE (c);
2329               if (c >= '@' && c <= 'B')
2330                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2331                 reg[0] = id = iso_charset_table[1][0][c];
2332               else if (c >= '(' && c <= '/')
2333                 {
2334                   ONE_MORE_BYTE (c1);
2335                   if (c1 < ' ' || c1 >= 0x80
2336                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2337                     /* Invalid designation sequence.  Just ignore.  */
2338                     break;
2339                   reg[(c - '(') % 4] = id;
2340                 }
2341               else
2342                 /* Invalid designation sequence.  Just ignore.  */
2343                 break;
2344             }
2345           else if (c == 'N' || c == 'O')
2346             {
2347               /* ESC <Fe> for SS2 or SS3.  */
2348               mask_iso &= CATEGORY_MASK_ISO_7_ELSE;
2349               break;
2350             }
2351           else if (c >= '0' && c <= '4')
2352             {
2353               /* ESC <Fp> for start/end composition.  */
2354               mask_found |= CATEGORY_MASK_ISO;
2355               break;
2356             }
2357           else
2358             {
2359               /* Invalid escape sequence.  */
2360               mask_iso &= ~CATEGORY_MASK_ISO_ESCAPE;
2361               break;
2362             }
2363
2364           /* We found a valid designation sequence for CHARSET.  */
2365           mask_iso &= ~CATEGORY_MASK_ISO_8BIT;
2366           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2367                               id))
2368             mask_found |= CATEGORY_MASK_ISO_7;
2369           else
2370             mask_iso &= ~CATEGORY_MASK_ISO_7;
2371           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2372                               id))
2373             mask_found |= CATEGORY_MASK_ISO_7_TIGHT;
2374           else
2375             mask_iso &= ~CATEGORY_MASK_ISO_7_TIGHT;
2376           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2377                               id))
2378             mask_found |= CATEGORY_MASK_ISO_7_ELSE;
2379           else
2380             mask_iso &= ~CATEGORY_MASK_ISO_7_ELSE;
2381           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2382                               id))
2383             mask_found |= CATEGORY_MASK_ISO_8_ELSE;
2384           else
2385             mask_iso &= ~CATEGORY_MASK_ISO_8_ELSE;
2386           break;
2387
2388         case ISO_CODE_SO:
2389           if (inhibit_iso_escape_detection)
2390             break;
2391           single_shifting = 0;
2392           if (shift_out == 0
2393               && (reg[1] >= 0
2394                   || SHIFT_OUT_OK (coding_category_iso_7_else)
2395                   || SHIFT_OUT_OK (coding_category_iso_8_else)))
2396             {
2397               /* Locking shift out.  */
2398               mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2399               mask_found |= CATEGORY_MASK_ISO_ELSE;
2400             }
2401           break;
2402
2403         case ISO_CODE_SI:
2404           if (inhibit_iso_escape_detection)
2405             break;
2406           single_shifting = 0;
2407           if (shift_out == 1)
2408             {
2409               /* Locking shift in.  */
2410               mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2411               mask_found |= CATEGORY_MASK_ISO_ELSE;
2412             }
2413           break;
2414
2415         case ISO_CODE_CSI:
2416           single_shifting = 0;
2417         case ISO_CODE_SS2:
2418         case ISO_CODE_SS3:
2419           {
2420             int newmask = CATEGORY_MASK_ISO_8_ELSE;
2421
2422             if (inhibit_iso_escape_detection)
2423               break;
2424             if (c != ISO_CODE_CSI)
2425               {
2426                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2427                     & CODING_ISO_FLAG_SINGLE_SHIFT)
2428                   newmask |= CATEGORY_MASK_ISO_8_1;
2429                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2430                     & CODING_ISO_FLAG_SINGLE_SHIFT)
2431                   newmask |= CATEGORY_MASK_ISO_8_2;
2432                 single_shifting = 1;
2433               }
2434             if (VECTORP (Vlatin_extra_code_table)
2435                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2436               {
2437                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2438                     & CODING_ISO_FLAG_LATIN_EXTRA)
2439                   newmask |= CATEGORY_MASK_ISO_8_1;
2440                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2441                     & CODING_ISO_FLAG_LATIN_EXTRA)
2442                   newmask |= CATEGORY_MASK_ISO_8_2;
2443               }
2444             mask_iso &= newmask;
2445             mask_found |= newmask;
2446           }
2447           break;
2448
2449         default:
2450           if (c < 0x80)
2451             {
2452               single_shifting = 0;
2453               break;
2454             }
2455           else if (c < 0xA0)
2456             {
2457               single_shifting = 0;
2458               mask_8bit_found = 1;
2459               if (VECTORP (Vlatin_extra_code_table)
2460                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2461                 {
2462                   int newmask = 0;
2463
2464                   if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2465                       & CODING_ISO_FLAG_LATIN_EXTRA)
2466                     newmask |= CATEGORY_MASK_ISO_8_1;
2467                   if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2468                       & CODING_ISO_FLAG_LATIN_EXTRA)
2469                     newmask |= CATEGORY_MASK_ISO_8_2;
2470                   mask_iso &= newmask;
2471                   mask_found |= newmask;
2472                 }
2473               else
2474                 return 0;
2475             }
2476           else
2477             {
2478               mask_iso &= ~(CATEGORY_MASK_ISO_7BIT
2479                             | CATEGORY_MASK_ISO_7_ELSE);
2480               mask_found |= CATEGORY_MASK_ISO_8_1;
2481               mask_8bit_found = 1;
2482               /* Check the length of succeeding codes of the range
2483                  0xA0..0FF.  If the byte length is odd, we exclude
2484                  CATEGORY_MASK_ISO_8_2.  We can check this only
2485                  when we are not single shifting.  */
2486               if (!single_shifting
2487                   && mask_iso & CATEGORY_MASK_ISO_8_2)
2488                 {
2489                   int i = 1;
2490                   while (src < src_end)
2491                     {
2492                       ONE_MORE_BYTE (c);
2493                       if (c < 0xA0)
2494                         break;
2495                       i++;
2496                     }
2497
2498                   if (i & 1 && src < src_end)
2499                     mask_iso &= ~CATEGORY_MASK_ISO_8_2;
2500                   else
2501                     mask_found |= CATEGORY_MASK_ISO_8_2;
2502                 }
2503             }
2504           break;
2505         }
2506     }
2507  no_more_source:
2508   if (!mask_iso)
2509     {
2510       *mask &= ~CATEGORY_MASK_ISO;
2511       return 0;
2512     }
2513   if (!mask_found)
2514     return 0;
2515   *mask &= mask_iso & mask_found;
2516   if (! mask_8bit_found)
2517     *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE);
2518   return 1;
2519 }
2520
2521
2522 /* Set designation state into CODING.  */
2523 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2524   do {                                                                  \
2525     int id, prev;                                                       \
2526                                                                         \
2527     if (final < '0' || final >= 128                                     \
2528         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2529         || !SAFE_CHARSET_P (coding, id))                                \
2530       {                                                                 \
2531         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2532         goto invalid_code;                                              \
2533       }                                                                 \
2534     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2535     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2536     /* If there was an invalid designation to REG previously, and this  \
2537        designation is ASCII to REG, we should keep this designation     \
2538        sequence.  */                                                    \
2539     if (prev == -2 && id == charset_ascii)                              \
2540       goto invalid_code;                                                \
2541   } while (0)
2542
2543
2544 #define MAYBE_FINISH_COMPOSITION()                              \
2545   do {                                                          \
2546     int i;                                                      \
2547     if (composition_state == COMPOSING_NO)                      \
2548       break;                                                    \
2549     /* It is assured that we have enough room for producing     \
2550        characters stored in the table `components'.  */         \
2551     if (charbuf + component_idx > charbuf_end)                  \
2552       goto no_more_source;                                      \
2553     composition_state = COMPOSING_NO;                           \
2554     if (method == COMPOSITION_RELATIVE                          \
2555         || method == COMPOSITION_WITH_ALTCHARS)                 \
2556       {                                                         \
2557         for (i = 0; i < component_idx; i++)                     \
2558           *charbuf++ = components[i];                           \
2559         char_offset += component_idx;                           \
2560       }                                                         \
2561     else                                                        \
2562       {                                                         \
2563         for (i = 0; i < component_idx; i += 2)                  \
2564           *charbuf++ = components[i];                           \
2565         char_offset += (component_idx / 2) + 1;                 \
2566       }                                                         \
2567   } while (0)
2568
2569
2570 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2571    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2572    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2573    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2574    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2575   */
2576
2577 #define DECODE_COMPOSITION_START(c1)                                    \
2578   do {                                                                  \
2579     if (c1 == '0'                                                       \
2580         && composition_state == COMPOSING_COMPONENT_CHAR)               \
2581       {                                                                 \
2582         component_len = component_idx;                                  \
2583         composition_state = COMPOSING_CHAR;                             \
2584       }                                                                 \
2585     else                                                                \
2586       {                                                                 \
2587         unsigned char *p;                                               \
2588                                                                         \
2589         MAYBE_FINISH_COMPOSITION ();                                    \
2590         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2591           goto no_more_source;                                          \
2592         for (p = src; p < src_end - 1; p++)                             \
2593           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2594             break;                                                      \
2595         if (p == src_end - 1)                                           \
2596           {                                                             \
2597             if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
2598               goto invalid_code;                                        \
2599             goto no_more_source;                                        \
2600           }                                                             \
2601                                                                         \
2602         /* This is surely the start of a composition.  */               \
2603         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2604                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2605                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2606                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2607         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2608                              : COMPOSING_COMPONENT_CHAR);               \
2609         component_idx = component_len = 0;                              \
2610       }                                                                 \
2611   } while (0)
2612
2613
2614 /* Handle compositoin end sequence ESC 1.  */
2615
2616 #define DECODE_COMPOSITION_END()                                        \
2617   do {                                                                  \
2618     int nchars = (component_len > 0 ? component_idx - component_len     \
2619                   : method == COMPOSITION_RELATIVE ? component_idx      \
2620                   : (component_idx + 1) / 2);                           \
2621     int i;                                                              \
2622     int *saved_charbuf = charbuf;                                       \
2623                                                                         \
2624     ADD_COMPOSITION_DATA (charbuf, method, nchars);                     \
2625     if (method != COMPOSITION_RELATIVE)                                 \
2626       {                                                                 \
2627         if (component_len == 0)                                         \
2628           for (i = 0; i < component_idx; i++)                           \
2629             *charbuf++ = components[i];                                 \
2630         else                                                            \
2631           for (i = 0; i < component_len; i++)                           \
2632             *charbuf++ = components[i];                                 \
2633         *saved_charbuf = saved_charbuf - charbuf;                       \
2634       }                                                                 \
2635     if (method == COMPOSITION_WITH_RULE)                                \
2636       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2637         *charbuf++ = components[i];                                     \
2638     else                                                                \
2639       for (i = component_len; i < component_idx; i++, char_offset++)    \
2640         *charbuf++ = components[i];                                     \
2641     coding->annotated = 1;                                              \
2642     composition_state = COMPOSING_NO;                                   \
2643   } while (0)
2644
2645
2646 /* Decode a composition rule from the byte C1 (and maybe one more byte
2647    from SRC) and store one encoded composition rule in
2648    coding->cmp_data.  */
2649
2650 #define DECODE_COMPOSITION_RULE(c1)                                     \
2651   do {                                                                  \
2652     (c1) -= 32;                                                         \
2653     if (c1 < 81)                /* old format (before ver.21) */        \
2654       {                                                                 \
2655         int gref = (c1) / 9;                                            \
2656         int nref = (c1) % 9;                                            \
2657         if (gref == 4) gref = 10;                                       \
2658         if (nref == 4) nref = 10;                                       \
2659         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2660       }                                                                 \
2661     else if (c1 < 93)           /* new format (after ver.21) */         \
2662       {                                                                 \
2663         ONE_MORE_BYTE (c2);                                             \
2664         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2665       }                                                                 \
2666     else                                                                \
2667       c1 = 0;                                                           \
2668   } while (0)
2669
2670
2671 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2672
2673 static void
2674 decode_coding_iso_2022 (coding)
2675      struct coding_system *coding;
2676 {
2677   unsigned char *src = coding->source + coding->consumed;
2678   unsigned char *src_end = coding->source + coding->src_bytes;
2679   unsigned char *src_base;
2680   int *charbuf = coding->charbuf;
2681   int *charbuf_end = charbuf + coding->charbuf_size - 4;
2682   int consumed_chars = 0, consumed_chars_base;
2683   int char_offset = 0;
2684   int multibytep = coding->src_multibyte;
2685   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2686   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2687   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2688   struct charset *charset;
2689   int c;
2690   /* For handling composition sequence.  */
2691 #define COMPOSING_NO                    0
2692 #define COMPOSING_CHAR                  1
2693 #define COMPOSING_RULE                  2
2694 #define COMPOSING_COMPONENT_CHAR        3
2695 #define COMPOSING_COMPONENT_RULE        4
2696
2697   int composition_state = COMPOSING_NO;
2698   enum composition_method method;
2699   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2700   int component_idx;
2701   int component_len;
2702   Lisp_Object attrs, eol_type, charset_list;
2703
2704   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2705   setup_iso_safe_charsets (attrs);
2706
2707   while (1)
2708     {
2709       int c1, c2;
2710
2711       src_base = src;
2712       consumed_chars_base = consumed_chars;
2713
2714       if (charbuf >= charbuf_end)
2715         break;
2716
2717       ONE_MORE_BYTE (c1);
2718
2719       /* We produce no character or one character.  */
2720       switch (iso_code_class [c1])
2721         {
2722         case ISO_0x20_or_0x7F:
2723           if (composition_state != COMPOSING_NO)
2724             {
2725               if (composition_state == COMPOSING_RULE
2726                   || composition_state == COMPOSING_COMPONENT_RULE)
2727                 {
2728                   DECODE_COMPOSITION_RULE (c1);
2729                   components[component_idx++] = c1;
2730                   composition_state--;
2731                   continue;
2732                 }
2733               else if (method == COMPOSITION_WITH_RULE)
2734                 composition_state = COMPOSING_RULE;
2735               else if (method == COMPOSITION_WITH_RULE_ALTCHARS
2736                        && composition_state == COMPOSING_COMPONENT_CHAR)
2737                 composition_state = COMPOSING_COMPONENT_CHAR;
2738             }
2739           if (charset_id_0 < 0
2740               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2741             {
2742               /* This is SPACE or DEL.  */
2743               charset = CHARSET_FROM_ID (charset_ascii);
2744               break;
2745             }
2746           /* This is a graphic character, we fall down ...  */
2747
2748         case ISO_graphic_plane_0:
2749           if (composition_state == COMPOSING_RULE)
2750             {
2751               DECODE_COMPOSITION_RULE (c1);
2752               components[component_idx++] = c1;
2753               composition_state = COMPOSING_CHAR;
2754             }
2755           charset = CHARSET_FROM_ID (charset_id_0);
2756           break;
2757
2758         case ISO_0xA0_or_0xFF:
2759           if (charset_id_1 < 0
2760               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2761               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2762             goto invalid_code;
2763           /* This is a graphic character, we fall down ... */
2764
2765         case ISO_graphic_plane_1:
2766           if (charset_id_1 < 0)
2767             goto invalid_code;
2768           charset = CHARSET_FROM_ID (charset_id_1);
2769           break;
2770
2771         case ISO_carriage_return:
2772           if (c1 == '\r')
2773             {
2774               if (EQ (eol_type, Qdos))
2775                 {
2776                   if (src == src_end)
2777                     goto no_more_source;
2778                   if (*src == '\n')
2779                     ONE_MORE_BYTE (c1);
2780                 }
2781               else if (EQ (eol_type, Qmac))
2782                 c1 = '\n';
2783             }
2784           /* fall through */
2785
2786         case ISO_control_0:
2787           MAYBE_FINISH_COMPOSITION ();
2788           charset = CHARSET_FROM_ID (charset_ascii);
2789           break;
2790
2791         case ISO_control_1:
2792           MAYBE_FINISH_COMPOSITION ();
2793           goto invalid_code;
2794
2795         case ISO_shift_out:
2796           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2797               || CODING_ISO_DESIGNATION (coding, 1) < 0)
2798             goto invalid_code;
2799           CODING_ISO_INVOCATION (coding, 0) = 1;
2800           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2801           continue;
2802
2803         case ISO_shift_in:
2804           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2805             goto invalid_code;
2806           CODING_ISO_INVOCATION (coding, 0) = 0;
2807           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2808           continue;
2809
2810         case ISO_single_shift_2_7:
2811         case ISO_single_shift_2:
2812           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2813             goto invalid_code;
2814           /* SS2 is handled as an escape sequence of ESC 'N' */
2815           c1 = 'N';
2816           goto label_escape_sequence;
2817
2818         case ISO_single_shift_3:
2819           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2820             goto invalid_code;
2821           /* SS2 is handled as an escape sequence of ESC 'O' */
2822           c1 = 'O';
2823           goto label_escape_sequence;
2824
2825         case ISO_control_sequence_introducer:
2826           /* CSI is handled as an escape sequence of ESC '[' ...  */
2827           c1 = '[';
2828           goto label_escape_sequence;
2829
2830         case ISO_escape:
2831           ONE_MORE_BYTE (c1);
2832         label_escape_sequence:
2833           /* Escape sequences handled here are invocation,
2834              designation, direction specification, and character
2835              composition specification.  */
2836           switch (c1)
2837             {
2838             case '&':           /* revision of following character set */
2839               ONE_MORE_BYTE (c1);
2840               if (!(c1 >= '@' && c1 <= '~'))
2841                 goto invalid_code;
2842               ONE_MORE_BYTE (c1);
2843               if (c1 != ISO_CODE_ESC)
2844                 goto invalid_code;
2845               ONE_MORE_BYTE (c1);
2846               goto label_escape_sequence;
2847
2848             case '$':           /* designation of 2-byte character set */
2849               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2850                 goto invalid_code;
2851               ONE_MORE_BYTE (c1);
2852               if (c1 >= '@' && c1 <= 'B')
2853                 {       /* designation of JISX0208.1978, GB2312.1980,
2854                            or JISX0208.1980 */
2855                   DECODE_DESIGNATION (0, 2, 0, c1);
2856                 }
2857               else if (c1 >= 0x28 && c1 <= 0x2B)
2858                 {       /* designation of DIMENSION2_CHARS94 character set */
2859                   ONE_MORE_BYTE (c2);
2860                   DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2861                 }
2862               else if (c1 >= 0x2C && c1 <= 0x2F)
2863                 {       /* designation of DIMENSION2_CHARS96 character set */
2864                   ONE_MORE_BYTE (c2);
2865                   DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2866                 }
2867               else
2868                 goto invalid_code;
2869               /* We must update these variables now.  */
2870               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2871               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2872               continue;
2873
2874             case 'n':           /* invocation of locking-shift-2 */
2875               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2876                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
2877                 goto invalid_code;
2878               CODING_ISO_INVOCATION (coding, 0) = 2;
2879               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2880               continue;
2881
2882             case 'o':           /* invocation of locking-shift-3 */
2883               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2884                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
2885                 goto invalid_code;
2886               CODING_ISO_INVOCATION (coding, 0) = 3;
2887               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2888               continue;
2889
2890             case 'N':           /* invocation of single-shift-2 */
2891               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2892                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
2893                 goto invalid_code;
2894               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
2895               ONE_MORE_BYTE (c1);
2896               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2897                 goto invalid_code;
2898               break;
2899
2900             case 'O':           /* invocation of single-shift-3 */
2901               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2902                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
2903                 goto invalid_code;
2904               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
2905               ONE_MORE_BYTE (c1);
2906               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2907                 goto invalid_code;
2908               break;
2909
2910             case '0': case '2': case '3': case '4': /* start composition */
2911               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
2912                 goto invalid_code;
2913               DECODE_COMPOSITION_START (c1);
2914               continue;
2915
2916             case '1':           /* end composition */
2917               if (composition_state == COMPOSING_NO)
2918                 goto invalid_code;
2919               DECODE_COMPOSITION_END ();
2920               continue;
2921
2922             case '[':           /* specification of direction */
2923               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
2924                 goto invalid_code;
2925               /* For the moment, nested direction is not supported.
2926                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2927                  left-to-right, and nozero means right-to-left.  */
2928               ONE_MORE_BYTE (c1);
2929               switch (c1)
2930                 {
2931                 case ']':       /* end of the current direction */
2932                   coding->mode &= ~CODING_MODE_DIRECTION;
2933
2934                 case '0':       /* end of the current direction */
2935                 case '1':       /* start of left-to-right direction */
2936                   ONE_MORE_BYTE (c1);
2937                   if (c1 == ']')
2938                     coding->mode &= ~CODING_MODE_DIRECTION;
2939                   else
2940                     goto invalid_code;
2941                   break;
2942
2943                 case '2':       /* start of right-to-left direction */
2944                   ONE_MORE_BYTE (c1);
2945                   if (c1 == ']')
2946                     coding->mode |= CODING_MODE_DIRECTION;
2947                   else
2948                     goto invalid_code;
2949                   break;
2950
2951                 default:
2952                   goto invalid_code;
2953                 }
2954               continue;
2955
2956             default:
2957               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2958                 goto invalid_code;
2959               if (c1 >= 0x28 && c1 <= 0x2B)
2960                 {       /* designation of DIMENSION1_CHARS94 character set */
2961                   ONE_MORE_BYTE (c2);
2962                   DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
2963                 }
2964               else if (c1 >= 0x2C && c1 <= 0x2F)
2965                 {       /* designation of DIMENSION1_CHARS96 character set */
2966                   ONE_MORE_BYTE (c2);
2967                   DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
2968                 }
2969               else
2970                 goto invalid_code;
2971               /* We must update these variables now.  */
2972               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2973               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2974               continue;
2975             }
2976         }
2977
2978       /* Now we know CHARSET and 1st position code C1 of a character.
2979          Produce a decoded character while getting 2nd position code
2980          C2 if necessary.  */
2981       c1 &= 0x7F;
2982       if (CHARSET_DIMENSION (charset) > 1)
2983         {
2984           ONE_MORE_BYTE (c2);
2985           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
2986             /* C2 is not in a valid range.  */
2987             goto invalid_code;
2988           c1 = (c1 << 8) | (c2 & 0x7F);
2989           if (CHARSET_DIMENSION (charset) > 2)
2990             {
2991               ONE_MORE_BYTE (c2);
2992               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
2993                 /* C2 is not in a valid range.  */
2994                 goto invalid_code;
2995               c1 = (c1 << 8) | (c2 & 0x7F);
2996             }
2997         }
2998
2999       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3000       if (c < 0)
3001         {
3002           MAYBE_FINISH_COMPOSITION ();
3003           for (; src_base < src; src_base++, char_offset++)
3004             {
3005               if (ASCII_BYTE_P (*src_base))
3006                 *charbuf++ = *src_base;
3007               else
3008                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3009             }
3010         }
3011       else if (composition_state == COMPOSING_NO)
3012         {
3013           *charbuf++ = c;
3014           char_offset++;
3015         }
3016       else
3017         components[component_idx++] = c;
3018       continue;
3019
3020     invalid_code:
3021       MAYBE_FINISH_COMPOSITION ();
3022       src = src_base;
3023       consumed_chars = consumed_chars_base;
3024       ONE_MORE_BYTE (c);
3025       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3026       coding->errors++;
3027     }
3028
3029  no_more_source:
3030   coding->consumed_char += consumed_chars_base;
3031   coding->consumed = src_base - coding->source;
3032   coding->charbuf_used = charbuf - coding->charbuf;
3033 }
3034
3035
3036 /* ISO2022 encoding stuff.  */
3037
3038 /*
3039    It is not enough to say just "ISO2022" on encoding, we have to
3040    specify more details.  In Emacs, each coding system of ISO2022
3041    variant has the following specifications:
3042         1. Initial designation to G0 thru G3.
3043         2. Allows short-form designation?
3044         3. ASCII should be designated to G0 before control characters?
3045         4. ASCII should be designated to G0 at end of line?
3046         5. 7-bit environment or 8-bit environment?
3047         6. Use locking-shift?
3048         7. Use Single-shift?
3049    And the following two are only for Japanese:
3050         8. Use ASCII in place of JIS0201-1976-Roman?
3051         9. Use JISX0208-1983 in place of JISX0208-1978?
3052    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3053    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3054    details.
3055 */
3056
3057 /* Produce codes (escape sequence) for designating CHARSET to graphic
3058    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3059    '@', 'A', or 'B' and the coding system CODING allows, produce
3060    designation sequence of short-form.  */
3061
3062 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3063   do {                                                                  \
3064     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3065     char *intermediate_char_94 = "()*+";                                \
3066     char *intermediate_char_96 = ",-./";                                \
3067     int revision = -1;                                                  \
3068     int c;                                                              \
3069                                                                         \
3070     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3071       revision = XINT (CHARSET_ISO_REVISION (charset));                 \
3072                                                                         \
3073     if (revision >= 0)                                                  \
3074       {                                                                 \
3075         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3076         EMIT_ONE_BYTE ('@' + revision);                                 \
3077       }                                                                 \
3078     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3079     if (CHARSET_DIMENSION (charset) == 1)                               \
3080       {                                                                 \
3081         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3082           c = intermediate_char_94[reg];                                \
3083         else                                                            \
3084           c = intermediate_char_96[reg];                                \
3085         EMIT_ONE_ASCII_BYTE (c);                                        \
3086       }                                                                 \
3087     else                                                                \
3088       {                                                                 \
3089         EMIT_ONE_ASCII_BYTE ('$');                                      \
3090         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3091           {                                                             \
3092             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3093                 || reg != 0                                             \
3094                 || final_char < '@' || final_char > 'B')                \
3095               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3096           }                                                             \
3097         else                                                            \
3098           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3099       }                                                                 \
3100     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3101                                                                         \
3102     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3103   } while (0)
3104
3105
3106 /* The following two macros produce codes (control character or escape
3107    sequence) for ISO2022 single-shift functions (single-shift-2 and
3108    single-shift-3).  */
3109
3110 #define ENCODE_SINGLE_SHIFT_2                                           \
3111   do {                                                                  \
3112     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3113       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3114     else                                                                \
3115       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3116     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3117   } while (0)
3118
3119
3120 #define ENCODE_SINGLE_SHIFT_3                                           \
3121   do {                                                                  \
3122     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3123       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
3124     else                                                                \
3125       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
3126     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3127   } while (0)
3128
3129
3130 /* The following four macros produce codes (control character or
3131    escape sequence) for ISO2022 locking-shift functions (shift-in,
3132    shift-out, locking-shift-2, and locking-shift-3).  */
3133
3134 #define ENCODE_SHIFT_IN                                 \
3135   do {                                                  \
3136     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
3137     CODING_ISO_INVOCATION (coding, 0) = 0;              \
3138   } while (0)
3139
3140
3141 #define ENCODE_SHIFT_OUT                                \
3142   do {                                                  \
3143     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
3144     CODING_ISO_INVOCATION (coding, 0) = 1;              \
3145   } while (0)
3146
3147
3148 #define ENCODE_LOCKING_SHIFT_2                          \
3149   do {                                                  \
3150     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3151     CODING_ISO_INVOCATION (coding, 0) = 2;              \
3152   } while (0)
3153
3154
3155 #define ENCODE_LOCKING_SHIFT_3                          \
3156   do {                                                  \
3157     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3158     CODING_ISO_INVOCATION (coding, 0) = 3;              \
3159   } while (0)
3160
3161
3162 /* Produce codes for a DIMENSION1 character whose character set is
3163    CHARSET and whose position-code is C1.  Designation and invocation
3164    sequences are also produced in advance if necessary.  */
3165
3166 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
3167   do {                                                                  \
3168     int id = CHARSET_ID (charset);                                      \
3169     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3170       {                                                                 \
3171         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3172           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
3173         else                                                            \
3174           EMIT_ONE_BYTE (c1 | 0x80);                                    \
3175         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3176         break;                                                          \
3177       }                                                                 \
3178     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3179       {                                                                 \
3180         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
3181         break;                                                          \
3182       }                                                                 \
3183     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3184       {                                                                 \
3185         EMIT_ONE_BYTE (c1 | 0x80);                                      \
3186         break;                                                          \
3187       }                                                                 \
3188     else                                                                \
3189       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3190          must invoke it, or, at first, designate it to some graphic     \
3191          register.  Then repeat the loop to actually produce the        \
3192          character.  */                                                 \
3193       dst = encode_invocation_designation (charset, coding, dst,        \
3194                                            &produced_chars);            \
3195   } while (1)
3196
3197
3198 /* Produce codes for a DIMENSION2 character whose character set is
3199    CHARSET and whose position-codes are C1 and C2.  Designation and
3200    invocation codes are also produced in advance if necessary.  */
3201
3202 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
3203   do {                                                                  \
3204     int id = CHARSET_ID (charset);                                      \
3205     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3206       {                                                                 \
3207         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3208           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
3209         else                                                            \
3210           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
3211         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3212         break;                                                          \
3213       }                                                                 \
3214     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3215       {                                                                 \
3216         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
3217         break;                                                          \
3218       }                                                                 \
3219     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3220       {                                                                 \
3221         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
3222         break;                                                          \
3223       }                                                                 \
3224     else                                                                \
3225       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3226          must invoke it, or, at first, designate it to some graphic     \
3227          register.  Then repeat the loop to actually produce the        \
3228          character.  */                                                 \
3229       dst = encode_invocation_designation (charset, coding, dst,        \
3230                                            &produced_chars);            \
3231   } while (1)
3232
3233
3234 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
3235   do {                                                                     \
3236     int code = ENCODE_CHAR ((charset),(c));                                \
3237                                                                            \
3238     if (CHARSET_DIMENSION (charset) == 1)                                  \
3239       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
3240     else                                                                   \
3241       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3242   } while (0)
3243
3244
3245 /* Produce designation and invocation codes at a place pointed by DST
3246    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3247    Return new DST.  */
3248
3249 unsigned char *
3250 encode_invocation_designation (charset, coding, dst, p_nchars)
3251      struct charset *charset;
3252      struct coding_system *coding;
3253      unsigned char *dst;
3254      int *p_nchars;
3255 {
3256   int multibytep = coding->dst_multibyte;
3257   int produced_chars = *p_nchars;
3258   int reg;                      /* graphic register number */
3259   int id = CHARSET_ID (charset);
3260
3261   /* At first, check designations.  */
3262   for (reg = 0; reg < 4; reg++)
3263     if (id == CODING_ISO_DESIGNATION (coding, reg))
3264       break;
3265
3266   if (reg >= 4)
3267     {
3268       /* CHARSET is not yet designated to any graphic registers.  */
3269       /* At first check the requested designation.  */
3270       reg = CODING_ISO_REQUEST (coding, id);
3271       if (reg < 0)
3272         /* Since CHARSET requests no special designation, designate it
3273            to graphic register 0.  */
3274         reg = 0;
3275
3276       ENCODE_DESIGNATION (charset, reg, coding);
3277     }
3278
3279   if (CODING_ISO_INVOCATION (coding, 0) != reg
3280       && CODING_ISO_INVOCATION (coding, 1) != reg)
3281     {
3282       /* Since the graphic register REG is not invoked to any graphic
3283          planes, invoke it to graphic plane 0.  */
3284       switch (reg)
3285         {
3286         case 0:                 /* graphic register 0 */
3287           ENCODE_SHIFT_IN;
3288           break;
3289
3290         case 1:                 /* graphic register 1 */
3291           ENCODE_SHIFT_OUT;
3292           break;
3293
3294         case 2:                 /* graphic register 2 */
3295           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3296             ENCODE_SINGLE_SHIFT_2;
3297           else
3298             ENCODE_LOCKING_SHIFT_2;
3299           break;
3300
3301         case 3:                 /* graphic register 3 */
3302           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3303             ENCODE_SINGLE_SHIFT_3;
3304           else
3305             ENCODE_LOCKING_SHIFT_3;
3306           break;
3307         }
3308     }
3309
3310   *p_nchars = produced_chars;
3311   return dst;
3312 }
3313
3314 /* The following three macros produce codes for indicating direction
3315    of text.  */
3316 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
3317   do {                                                                  \
3318     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
3319       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
3320     else                                                                \
3321       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
3322   } while (0)
3323
3324
3325 #define ENCODE_DIRECTION_R2L()                  \
3326   do {                                          \
3327     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3328     EMIT_TWO_ASCII_BYTES ('2', ']');            \
3329   } while (0)
3330
3331
3332 #define ENCODE_DIRECTION_L2R()                  \
3333   do {                                          \
3334     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3335     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3336   } while (0)
3337
3338
3339 /* Produce codes for designation and invocation to reset the graphic
3340    planes and registers to initial state.  */
3341 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3342   do {                                                                  \
3343     int reg;                                                            \
3344     struct charset *charset;                                            \
3345                                                                         \
3346     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3347       ENCODE_SHIFT_IN;                                                  \
3348     for (reg = 0; reg < 4; reg++)                                       \
3349       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3350           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3351               != CODING_ISO_INITIAL (coding, reg)))                     \
3352         {                                                               \
3353           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3354           ENCODE_DESIGNATION (charset, reg, coding);                    \
3355         }                                                               \
3356   } while (0)
3357
3358
3359 /* Produce designation sequences of charsets in the line started from
3360    SRC to a place pointed by DST, and return updated DST.
3361
3362    If the current block ends before any end-of-line, we may fail to
3363    find all the necessary designations.  */
3364
3365 static unsigned char *
3366 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3367      struct coding_system *coding;
3368      int *charbuf, *charbuf_end;
3369      unsigned char *dst;
3370 {
3371   struct charset *charset;
3372   /* Table of charsets to be designated to each graphic register.  */
3373   int r[4];
3374   int c, found = 0, reg;
3375   int produced_chars = 0;
3376   int multibytep = coding->dst_multibyte;
3377   Lisp_Object attrs;
3378   Lisp_Object charset_list;
3379
3380   attrs = CODING_ID_ATTRS (coding->id);
3381   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3382   if (EQ (charset_list, Qiso_2022))
3383     charset_list = Viso_2022_charset_list;
3384
3385   for (reg = 0; reg < 4; reg++)
3386     r[reg] = -1;
3387
3388   while (found < 4)
3389     {
3390       int id;
3391
3392       c = *charbuf++;
3393       if (c == '\n')
3394         break;
3395       charset = char_charset (c, charset_list, NULL);
3396       id = CHARSET_ID (charset);
3397       reg = CODING_ISO_REQUEST (coding, id);
3398       if (reg >= 0 && r[reg] < 0)
3399         {
3400           found++;
3401           r[reg] = id;
3402         }
3403     }
3404
3405   if (found)
3406     {
3407       for (reg = 0; reg < 4; reg++)
3408         if (r[reg] >= 0
3409             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3410           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3411     }
3412
3413   return dst;
3414 }
3415
3416 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3417
3418 static int
3419 encode_coding_iso_2022 (coding)
3420      struct coding_system *coding;
3421 {
3422   int multibytep = coding->dst_multibyte;
3423   int *charbuf = coding->charbuf;
3424   int *charbuf_end = charbuf + coding->charbuf_used;
3425   unsigned char *dst = coding->destination + coding->produced;
3426   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3427   int safe_room = 16;
3428   int bol_designation
3429     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3430        && CODING_ISO_BOL (coding));
3431   int produced_chars = 0;
3432   Lisp_Object attrs, eol_type, charset_list;
3433   int ascii_compatible;
3434   int c;
3435
3436   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3437
3438   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3439
3440   while (charbuf < charbuf_end)
3441     {
3442       ASSURE_DESTINATION (safe_room);
3443
3444       if (bol_designation)
3445         {
3446           unsigned char *dst_prev = dst;
3447
3448           /* We have to produce designation sequences if any now.  */
3449           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3450           bol_designation = 0;
3451           /* We are sure that designation sequences are all ASCII bytes.  */
3452           produced_chars += dst - dst_prev;
3453         }
3454
3455       c = *charbuf++;
3456
3457       /* Now encode the character C.  */
3458       if (c < 0x20 || c == 0x7F)
3459         {
3460           if (c == '\n'
3461               || (c == '\r' && EQ (eol_type, Qmac)))
3462             {
3463               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3464                 ENCODE_RESET_PLANE_AND_REGISTER ();
3465               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3466                 {
3467                   int i;
3468
3469                   for (i = 0; i < 4; i++)
3470                     CODING_ISO_DESIGNATION (coding, i)
3471                       = CODING_ISO_INITIAL (coding, i);
3472                 }
3473               bol_designation
3474                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3475             }
3476           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3477             ENCODE_RESET_PLANE_AND_REGISTER ();
3478           EMIT_ONE_ASCII_BYTE (c);
3479         }
3480       else if (ASCII_CHAR_P (c))
3481         {
3482           if (ascii_compatible)
3483             EMIT_ONE_ASCII_BYTE (c);
3484           else
3485             ENCODE_ISO_CHARACTER (CHARSET_FROM_ID (charset_ascii), c);
3486         }
3487       else
3488         {
3489           struct charset *charset = char_charset (c, charset_list, NULL);
3490
3491           if (!charset)
3492             {
3493               c = coding->default_char;
3494               charset = char_charset (c, charset_list, NULL);
3495             }
3496           ENCODE_ISO_CHARACTER (charset, c);
3497         }
3498     }
3499
3500   if (coding->mode & CODING_MODE_LAST_BLOCK
3501       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3502     {
3503       ASSURE_DESTINATION (safe_room);
3504       ENCODE_RESET_PLANE_AND_REGISTER ();
3505     }
3506   coding->result = CODING_RESULT_SUCCESS;
3507   CODING_ISO_BOL (coding) = bol_designation;
3508   coding->produced_char += produced_chars;
3509   coding->produced = dst - coding->destination;
3510   return 0;
3511 }
3512
3513 \f
3514 /*** 8,9. SJIS and BIG5 handlers ***/
3515
3516 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3517    quite widely.  So, for the moment, Emacs supports them in the bare
3518    C code.  But, in the future, they may be supported only by CCL.  */
3519
3520 /* SJIS is a coding system encoding three character sets: ASCII, right
3521    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3522    as is.  A character of charset katakana-jisx0201 is encoded by
3523    "position-code + 0x80".  A character of charset japanese-jisx0208
3524    is encoded in 2-byte but two position-codes are divided and shifted
3525    so that it fit in the range below.
3526
3527    --- CODE RANGE of SJIS ---
3528    (character set)      (range)
3529    ASCII                0x00 .. 0x7F
3530    KATAKANA-JISX0201    0xA0 .. 0xDF
3531    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3532             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3533    -------------------------------
3534
3535 */
3536
3537 /* BIG5 is a coding system encoding two character sets: ASCII and
3538    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3539    character set and is encoded in two-byte.
3540
3541    --- CODE RANGE of BIG5 ---
3542    (character set)      (range)
3543    ASCII                0x00 .. 0x7F
3544    Big5 (1st byte)      0xA1 .. 0xFE
3545         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3546    --------------------------
3547
3548   */
3549
3550 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3551    Check if a text is encoded in SJIS.  If it is, return
3552    CATEGORY_MASK_SJIS, else return 0.  */
3553
3554 static int
3555 detect_coding_sjis (coding, mask)
3556      struct coding_system *coding;
3557      int *mask;
3558 {
3559   unsigned char *src = coding->source, *src_base = src;
3560   unsigned char *src_end = coding->source + coding->src_bytes;
3561   int multibytep = coding->src_multibyte;
3562   int consumed_chars = 0;
3563   int found = 0;
3564   int c;
3565
3566   /* A coding system of this category is always ASCII compatible.  */
3567   src += coding->head_ascii;
3568
3569   while (1)
3570     {
3571       ONE_MORE_BYTE (c);
3572       if (c < 0x80)
3573         continue;
3574       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3575         {
3576           ONE_MORE_BYTE (c);
3577           if (c < 0x40 || c == 0x7F || c > 0xFC)
3578             break;
3579           found = 1;
3580         }
3581       else if (c >= 0xA0 && c < 0xE0)
3582         found = 1;
3583       else
3584         break;
3585     }
3586   *mask &= ~CATEGORY_MASK_SJIS;
3587   return 0;
3588
3589  no_more_source:
3590   if (!found)
3591     return 0;
3592   *mask &= CATEGORY_MASK_SJIS;
3593   return 1;
3594 }
3595
3596 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3597    Check if a text is encoded in BIG5.  If it is, return
3598    CATEGORY_MASK_BIG5, else return 0.  */
3599
3600 static int
3601 detect_coding_big5 (coding, mask)
3602      struct coding_system *coding;
3603      int *mask;
3604 {
3605   unsigned char *src = coding->source, *src_base = src;
3606   unsigned char *src_end = coding->source + coding->src_bytes;
3607   int multibytep = coding->src_multibyte;
3608   int consumed_chars = 0;
3609   int found = 0;
3610   int c;
3611
3612   /* A coding system of this category is always ASCII compatible.  */
3613   src += coding->head_ascii;
3614
3615   while (1)
3616     {
3617       ONE_MORE_BYTE (c);
3618       if (c < 0x80)
3619         continue;
3620       if (c >= 0xA1)
3621         {
3622           ONE_MORE_BYTE (c);
3623           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3624             return 0;
3625           found = 1;
3626         }
3627       else
3628         break;
3629     }
3630   *mask &= ~CATEGORY_MASK_BIG5;
3631   return 0;
3632
3633  no_more_source:
3634   if (!found)
3635     return 0;
3636   *mask &= CATEGORY_MASK_BIG5;
3637   return 1;
3638 }
3639
3640 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3641    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3642
3643 static void
3644 decode_coding_sjis (coding)
3645      struct coding_system *coding;
3646 {
3647   unsigned char *src = coding->source + coding->consumed;
3648   unsigned char *src_end = coding->source + coding->src_bytes;
3649   unsigned char *src_base;
3650   int *charbuf = coding->charbuf;
3651   int *charbuf_end = charbuf + coding->charbuf_size;
3652   int consumed_chars = 0, consumed_chars_base;
3653   int multibytep = coding->src_multibyte;
3654   struct charset *charset_roman, *charset_kanji, *charset_kana;
3655   Lisp_Object attrs, eol_type, charset_list, val;
3656
3657   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3658
3659   val = charset_list;
3660   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3661   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3662   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
3663
3664   while (1)
3665     {
3666       int c, c1;
3667
3668       src_base = src;
3669       consumed_chars_base = consumed_chars;
3670
3671       if (charbuf >= charbuf_end)
3672         break;
3673
3674       ONE_MORE_BYTE (c);
3675
3676       if (c == '\r')
3677         {
3678           if (EQ (eol_type, Qdos))
3679             {
3680               if (src == src_end)
3681                 goto no_more_source;
3682               if (*src == '\n')
3683                 ONE_MORE_BYTE (c);
3684             }
3685           else if (EQ (eol_type, Qmac))
3686             c = '\n';
3687         }
3688       else
3689         {
3690           struct charset *charset;
3691
3692           if (c < 0x80)
3693             charset = charset_roman;
3694           else
3695             {
3696               if (c >= 0xF0)
3697                 goto invalid_code;
3698               if (c < 0xA0 || c >= 0xE0)
3699                 {
3700                   /* SJIS -> JISX0208 */
3701                   ONE_MORE_BYTE (c1);
3702                   if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3703                     goto invalid_code;
3704                   c = (c << 8) | c1;
3705                   SJIS_TO_JIS (c);
3706                   charset = charset_kanji;
3707                 }
3708               else
3709                 /* SJIS -> JISX0201-Kana */
3710                 charset = charset_kana;
3711             }
3712           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3713         }
3714       *charbuf++ = c;
3715       continue;
3716
3717     invalid_code:
3718       src = src_base;
3719       consumed_chars = consumed_chars_base;
3720       ONE_MORE_BYTE (c);
3721       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3722       coding->errors++;
3723     }
3724
3725  no_more_source:
3726   coding->consumed_char += consumed_chars_base;
3727   coding->consumed = src_base - coding->source;
3728   coding->charbuf_used = charbuf - coding->charbuf;
3729 }
3730
3731 static void
3732 decode_coding_big5 (coding)
3733      struct coding_system *coding;
3734 {
3735   unsigned char *src = coding->source + coding->consumed;
3736   unsigned char *src_end = coding->source + coding->src_bytes;
3737   unsigned char *src_base;
3738   int *charbuf = coding->charbuf;
3739   int *charbuf_end = charbuf + coding->charbuf_size;
3740   int consumed_chars = 0, consumed_chars_base;
3741   int multibytep = coding->src_multibyte;
3742   struct charset *charset_roman, *charset_big5;
3743   Lisp_Object attrs, eol_type, charset_list, val;
3744
3745   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3746   val = charset_list;
3747   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3748   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3749
3750   while (1)
3751     {
3752       int c, c1;
3753
3754       src_base = src;
3755       consumed_chars_base = consumed_chars;
3756
3757       if (charbuf >= charbuf_end)
3758         break;
3759
3760       ONE_MORE_BYTE (c);
3761
3762       if (c == '\r')
3763         {
3764           if (EQ (eol_type, Qdos))
3765             {
3766               if (src == src_end)
3767                 goto no_more_source;
3768               if (*src == '\n')
3769                 ONE_MORE_BYTE (c);
3770             }
3771           else if (EQ (eol_type, Qmac))
3772             c = '\n';
3773         }
3774       else
3775         {
3776           struct charset *charset;
3777           if (c < 0x80)
3778             charset = charset_roman;
3779           else
3780             {
3781               /* BIG5 -> Big5 */
3782               if (c < 0xA1 || c > 0xFE)
3783                 goto invalid_code;
3784               ONE_MORE_BYTE (c1);
3785               if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
3786                 goto invalid_code;
3787               c = c << 8 | c1;
3788               charset = charset_big5;
3789             }
3790           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3791         }
3792
3793       *charbuf++ = c;
3794       continue;
3795
3796     invalid_code:
3797       src = src_base;
3798       consumed_chars = consumed_chars_base;
3799       ONE_MORE_BYTE (c);
3800       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3801       coding->errors++;
3802     }
3803
3804  no_more_source:
3805   coding->consumed_char += consumed_chars_base;
3806   coding->consumed = src_base - coding->source;
3807   coding->charbuf_used = charbuf - coding->charbuf;
3808 }
3809
3810 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3811    This function can encode charsets `ascii', `katakana-jisx0201',
3812    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3813    are sure that all these charsets are registered as official charset
3814    (i.e. do not have extended leading-codes).  Characters of other
3815    charsets are produced without any encoding.  If SJIS_P is 1, encode
3816    SJIS text, else encode BIG5 text.  */
3817
3818 static int
3819 encode_coding_sjis (coding)
3820      struct coding_system *coding;
3821 {
3822   int multibytep = coding->dst_multibyte;
3823   int *charbuf = coding->charbuf;
3824   int *charbuf_end = charbuf + coding->charbuf_used;
3825   unsigned char *dst = coding->destination + coding->produced;
3826   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3827   int safe_room = 4;
3828   int produced_chars = 0;
3829   Lisp_Object attrs, eol_type, charset_list, val;
3830   int ascii_compatible;
3831   struct charset *charset_roman, *charset_kanji, *charset_kana;
3832   int c;
3833
3834   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3835   val = charset_list;
3836   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3837   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3838   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3839
3840   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3841
3842   while (charbuf < charbuf_end)
3843     {
3844       ASSURE_DESTINATION (safe_room);
3845       c = *charbuf++;
3846       /* Now encode the character C.  */
3847       if (ASCII_CHAR_P (c) && ascii_compatible)
3848         EMIT_ONE_ASCII_BYTE (c);
3849       else
3850         {
3851           unsigned code;
3852           struct charset *charset = char_charset (c, charset_list, &code);
3853
3854           if (!charset)
3855             {
3856               c = coding->default_char;
3857               charset = char_charset (c, charset_list, &code);
3858             }
3859           if (code == CHARSET_INVALID_CODE (charset))
3860             abort ();
3861           if (charset == charset_kanji)
3862             {
3863               int c1, c2;
3864               JIS_TO_SJIS (code);
3865               c1 = code >> 8, c2 = code & 0xFF;
3866               EMIT_TWO_BYTES (c1, c2);
3867             }
3868           else if (charset == charset_kana)
3869             EMIT_ONE_BYTE (code | 0x80);
3870           else
3871             EMIT_ONE_ASCII_BYTE (code & 0x7F);
3872         }
3873     }
3874   coding->result = CODING_RESULT_SUCCESS;
3875   coding->produced_char += produced_chars;
3876   coding->produced = dst - coding->destination;
3877   return 0;
3878 }
3879
3880 static int
3881 encode_coding_big5 (coding)
3882      struct coding_system *coding;
3883 {
3884   int multibytep = coding->dst_multibyte;
3885   int *charbuf = coding->charbuf;
3886   int *charbuf_end = charbuf + coding->charbuf_used;
3887   unsigned char *dst = coding->destination + coding->produced;
3888   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3889   int safe_room = 4;
3890   int produced_chars = 0;
3891   Lisp_Object attrs, eol_type, charset_list, val;
3892   int ascii_compatible;
3893   struct charset *charset_roman, *charset_big5;
3894   int c;
3895
3896   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3897   val = charset_list;
3898   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3899   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3900   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3901
3902   while (charbuf < charbuf_end)
3903     {
3904       ASSURE_DESTINATION (safe_room);
3905       c = *charbuf++;
3906       /* Now encode the character C.  */
3907       if (ASCII_CHAR_P (c) && ascii_compatible)
3908         EMIT_ONE_ASCII_BYTE (c);
3909       else
3910         {
3911           unsigned code;
3912           struct charset *charset = char_charset (c, charset_list, &code);
3913
3914           if (! charset)
3915             {
3916               c = coding->default_char;
3917               charset = char_charset (c, charset_list, &code);
3918             }
3919           if (code == CHARSET_INVALID_CODE (charset))
3920             abort ();
3921           if (charset == charset_big5)
3922             {
3923               int c1, c2;
3924
3925               c1 = code >> 8, c2 = code & 0xFF;
3926               EMIT_TWO_BYTES (c1, c2);
3927             }
3928           else
3929             EMIT_ONE_ASCII_BYTE (code & 0x7F);
3930         }
3931     }
3932   coding->result = CODING_RESULT_SUCCESS;
3933   coding->produced_char += produced_chars;
3934   coding->produced = dst - coding->destination;
3935   return 0;
3936 }
3937
3938 \f
3939 /*** 10. CCL handlers ***/
3940
3941 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3942    Check if a text is encoded in a coding system of which
3943    encoder/decoder are written in CCL program.  If it is, return
3944    CATEGORY_MASK_CCL, else return 0.  */
3945
3946 static int
3947 detect_coding_ccl (coding, mask)
3948      struct coding_system *coding;
3949      int *mask;
3950 {
3951   unsigned char *src = coding->source, *src_base = src;
3952   unsigned char *src_end = coding->source + coding->src_bytes;
3953   int multibytep = coding->src_multibyte;
3954   int consumed_chars = 0;
3955   int found = 0;
3956   unsigned char *valids = CODING_CCL_VALIDS (coding);
3957   int head_ascii = coding->head_ascii;
3958   Lisp_Object attrs;
3959
3960   coding = &coding_categories[coding_category_ccl];
3961   attrs = CODING_ID_ATTRS (coding->id);
3962   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
3963     src += head_ascii;
3964
3965   while (1)
3966     {
3967       int c;
3968       ONE_MORE_BYTE (c);
3969       if (! valids[c])
3970         break;
3971       if (!found && valids[c] > 1)
3972         found = 1;
3973     }
3974   *mask &= ~CATEGORY_MASK_CCL;
3975   return 0;
3976
3977  no_more_source:
3978   if (!found)
3979     return 0;
3980   *mask &= CATEGORY_MASK_CCL;
3981   return 1;
3982 }
3983
3984 static void
3985 decode_coding_ccl (coding)
3986      struct coding_system *coding;
3987 {
3988   unsigned char *src = coding->source + coding->consumed;
3989   unsigned char *src_end = coding->source + coding->src_bytes;
3990   int *charbuf = coding->charbuf;
3991   int *charbuf_end = charbuf + coding->charbuf_size;
3992   int consumed_chars = 0;
3993   int multibytep = coding->src_multibyte;
3994   struct ccl_program ccl;
3995   int source_charbuf[1024];
3996   int source_byteidx[1024];
3997
3998   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
3999
4000   while (src < src_end)
4001     {
4002       unsigned char *p = src;
4003       int *source, *source_end;
4004       int i = 0;
4005
4006       if (multibytep)
4007         while (i < 1024 && p < src_end)
4008           {
4009             source_byteidx[i] = p - src;
4010             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4011           }
4012       else
4013         while (i < 1024 && p < src_end)
4014           source_charbuf[i++] = *p++;
4015
4016       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4017         ccl.last_block = 1;
4018
4019       source = source_charbuf;
4020       source_end = source + i;
4021       while (source < source_end)
4022         {
4023           ccl_driver (&ccl, source, charbuf,
4024                       source_end - source, charbuf_end - charbuf);
4025           source += ccl.consumed;
4026           charbuf += ccl.produced;
4027           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4028             break;
4029         }
4030       if (source < source_end)
4031         src += source_byteidx[source - source_charbuf];
4032       else
4033         src = p;
4034       consumed_chars += source - source_charbuf;
4035
4036       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4037           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4038         break;
4039     }
4040
4041   switch (ccl.status)
4042     {
4043     case CCL_STAT_SUSPEND_BY_SRC:
4044       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4045       break;
4046     case CCL_STAT_SUSPEND_BY_DST:
4047       break;
4048     case CCL_STAT_QUIT:
4049     case CCL_STAT_INVALID_CMD:
4050       coding->result = CODING_RESULT_INTERRUPT;
4051       break;
4052     default:
4053       coding->result = CODING_RESULT_SUCCESS;
4054       break;
4055     }
4056   coding->consumed_char += consumed_chars;
4057   coding->consumed = src - coding->source;
4058   coding->charbuf_used = charbuf - coding->charbuf;
4059 }
4060
4061 static int
4062 encode_coding_ccl (coding)
4063      struct coding_system *coding;
4064 {
4065   struct ccl_program ccl;
4066   int multibytep = coding->dst_multibyte;
4067   int *charbuf = coding->charbuf;
4068   int *charbuf_end = charbuf + coding->charbuf_used;
4069   unsigned char *dst = coding->destination + coding->produced;
4070   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4071   unsigned char *adjusted_dst_end = dst_end - 1;
4072   int destination_charbuf[1024];
4073   int i, produced_chars = 0;
4074
4075   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4076
4077   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4078   ccl.dst_multibyte = coding->dst_multibyte;
4079
4080   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4081     {
4082       int dst_bytes = dst_end - dst;
4083       if (dst_bytes > 1024)
4084         dst_bytes = 1024;
4085
4086       ccl_driver (&ccl, charbuf, destination_charbuf,
4087                   charbuf_end - charbuf, dst_bytes);
4088       charbuf += ccl.consumed;
4089       if (multibytep)
4090         for (i = 0; i < ccl.produced; i++)
4091           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4092       else
4093         {
4094           for (i = 0; i < ccl.produced; i++)
4095             *dst++ = destination_charbuf[i] & 0xFF;
4096           produced_chars += ccl.produced;
4097         }
4098     }
4099
4100   switch (ccl.status)
4101     {
4102     case CCL_STAT_SUSPEND_BY_SRC:
4103       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4104       break;
4105     case CCL_STAT_SUSPEND_BY_DST:
4106       coding->result = CODING_RESULT_INSUFFICIENT_DST;
4107       break;
4108     case CCL_STAT_QUIT:
4109     case CCL_STAT_INVALID_CMD:
4110       coding->result = CODING_RESULT_INTERRUPT;
4111       break;
4112     default:
4113       coding->result = CODING_RESULT_SUCCESS;
4114       break;
4115     }
4116
4117   coding->produced_char += produced_chars;
4118   coding->produced = dst - coding->destination;
4119   return 0;
4120 }
4121
4122
4123 \f
4124 /*** 10, 11. no-conversion handlers ***/
4125
4126 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4127
4128 static void
4129 decode_coding_raw_text (coding)
4130      struct coding_system *coding;
4131 {
4132   coding->chars_at_source = 1;
4133   coding->consumed_char = 0;
4134   coding->consumed = 0;
4135   coding->result = CODING_RESULT_SUCCESS;
4136 }
4137
4138 static int
4139 encode_coding_raw_text (coding)
4140      struct coding_system *coding;
4141 {
4142   int multibytep = coding->dst_multibyte;
4143   int *charbuf = coding->charbuf;
4144   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4145   unsigned char *dst = coding->destination + coding->produced;
4146   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4147   int produced_chars = 0;
4148   int c;
4149
4150   if (multibytep)
4151     {
4152       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4153
4154       if (coding->src_multibyte)
4155         while (charbuf < charbuf_end)
4156           {
4157             ASSURE_DESTINATION (safe_room);
4158             c = *charbuf++;
4159             if (ASCII_CHAR_P (c))
4160               EMIT_ONE_ASCII_BYTE (c);
4161             else if (CHAR_BYTE8_P (c))
4162               {
4163                 c = CHAR_TO_BYTE8 (c);
4164                 EMIT_ONE_BYTE (c);
4165               }
4166             else
4167               {
4168                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4169
4170                 CHAR_STRING_ADVANCE (c, p1);
4171                 while (p0 < p1)
4172                   EMIT_ONE_BYTE (*p0);
4173               }
4174           }
4175       else
4176         while (charbuf < charbuf_end)
4177           {
4178             ASSURE_DESTINATION (safe_room);
4179             c = *charbuf++;
4180             EMIT_ONE_BYTE (c);
4181           }
4182     }
4183   else
4184     {
4185       if (coding->src_multibyte)
4186         {
4187           int safe_room = MAX_MULTIBYTE_LENGTH;
4188
4189           while (charbuf < charbuf_end)
4190             {
4191               ASSURE_DESTINATION (safe_room);
4192               c = *charbuf++;
4193               if (ASCII_CHAR_P (c))
4194                 *dst++ = c;
4195               else if (CHAR_BYTE8_P (c))
4196                 *dst++ = CHAR_TO_BYTE8 (c);
4197               else
4198                 CHAR_STRING_ADVANCE (c, dst);
4199               produced_chars++;
4200             }
4201         }
4202       else
4203         {
4204           ASSURE_DESTINATION (charbuf_end - charbuf);
4205           while (charbuf < charbuf_end && dst < dst_end)
4206             *dst++ = *charbuf++;
4207           produced_chars = dst - (coding->destination + coding->dst_bytes);
4208         }
4209     }
4210   coding->result = CODING_RESULT_SUCCESS;
4211   coding->produced_char += produced_chars;
4212   coding->produced = dst - coding->destination;
4213   return 0;
4214 }
4215
4216 static int
4217 detect_coding_charset (coding, mask)
4218      struct coding_system *coding;
4219      int *mask;
4220 {
4221   unsigned char *src = coding->source, *src_base = src;
4222   unsigned char *src_end = coding->source + coding->src_bytes;
4223   int multibytep = coding->src_multibyte;
4224   int consumed_chars = 0;
4225   Lisp_Object attrs, valids;
4226
4227   coding = &coding_categories[coding_category_charset];
4228   attrs = CODING_ID_ATTRS (coding->id);
4229   valids = AREF (attrs, coding_attr_charset_valids);
4230
4231   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4232     src += coding->head_ascii;
4233
4234   while (1)
4235     {
4236       int c;
4237
4238       ONE_MORE_BYTE (c);
4239       if (NILP (AREF (valids, c)))
4240         break;
4241     }
4242   *mask &= ~CATEGORY_MASK_CHARSET;
4243   return 0;
4244
4245  no_more_source:
4246   *mask &= CATEGORY_MASK_CHARSET;
4247   return 1;
4248 }
4249
4250 static void
4251 decode_coding_charset (coding)
4252      struct coding_system *coding;
4253 {
4254   unsigned char *src = coding->source + coding->consumed;
4255   unsigned char *src_end = coding->source + coding->src_bytes;
4256   unsigned char *src_base;
4257   int *charbuf = coding->charbuf;
4258   int *charbuf_end = charbuf + coding->charbuf_size;
4259   int consumed_chars = 0, consumed_chars_base;
4260   int multibytep = coding->src_multibyte;
4261   Lisp_Object attrs, eol_type, charset_list, valids;
4262
4263   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4264   valids = AREF (attrs, coding_attr_charset_valids);
4265
4266   while (1)
4267     {
4268       int c;
4269
4270       src_base = src;
4271       consumed_chars_base = consumed_chars;
4272
4273       if (charbuf >= charbuf_end)
4274         break;
4275
4276       ONE_MORE_BYTE (c);
4277       if (c == '\r')
4278         {
4279           if (EQ (eol_type, Qdos))
4280             {
4281               if (src < src_end
4282                   && *src == '\n')
4283                 ONE_MORE_BYTE (c);
4284             }
4285           else if (EQ (eol_type, Qmac))
4286             c = '\n';
4287         }
4288       else
4289         {
4290           Lisp_Object val;
4291           struct charset *charset;
4292           int c1;
4293
4294           val = AREF (valids, c);
4295           if (NILP (val))
4296             goto invalid_code;
4297           charset = CHARSET_FROM_ID (XFASTINT (val));
4298           if (CHARSET_DIMENSION (charset) > 1)
4299             {
4300               ONE_MORE_BYTE (c1);
4301               c = (c << 8) | c1;
4302               if (CHARSET_DIMENSION (charset) > 2)
4303                 {
4304                   ONE_MORE_BYTE (c1);
4305                   c = (c << 8) | c1;
4306                   if (CHARSET_DIMENSION (charset) > 3)
4307                     {
4308                       ONE_MORE_BYTE (c1);
4309                       c = (c << 8) | c1;
4310                     }
4311                 }
4312             }
4313           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4314           if (c < 0)
4315             goto invalid_code;
4316         }
4317       *charbuf++ = c;
4318       continue;
4319
4320     invalid_code:
4321       src = src_base;
4322       consumed_chars = consumed_chars_base;
4323       ONE_MORE_BYTE (c);
4324       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4325       coding->errors++;
4326     }
4327
4328  no_more_source:
4329   coding->consumed_char += consumed_chars_base;
4330   coding->consumed = src_base - coding->source;
4331   coding->charbuf_used = charbuf - coding->charbuf;
4332 }
4333
4334 static int
4335 encode_coding_charset (coding)
4336      struct coding_system *coding;
4337 {
4338   int multibytep = coding->dst_multibyte;
4339   int *charbuf = coding->charbuf;
4340   int *charbuf_end = charbuf + coding->charbuf_used;
4341   unsigned char *dst = coding->destination + coding->produced;
4342   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4343   int safe_room = MAX_MULTIBYTE_LENGTH;
4344   int produced_chars = 0;
4345   struct charset *charset;
4346   Lisp_Object attrs, eol_type, charset_list;
4347   int ascii_compatible;
4348   int c;
4349
4350   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4351   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4352
4353   while (charbuf < charbuf_end)
4354     {
4355       struct charset *charset;
4356       unsigned code;
4357
4358       ASSURE_DESTINATION (safe_room);
4359       c = *charbuf++;
4360       if (ascii_compatible && ASCII_CHAR_P (c))
4361         EMIT_ONE_ASCII_BYTE (c);
4362       else
4363         {
4364           charset = char_charset (c, charset_list, &code);
4365           if (charset)
4366             {
4367               if (CHARSET_DIMENSION (charset) == 1)
4368                 EMIT_ONE_BYTE (code);
4369               else if (CHARSET_DIMENSION (charset) == 2)
4370                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4371               else if (CHARSET_DIMENSION (charset) == 3)
4372                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4373               else
4374                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4375                                  (code >> 8) & 0xFF, code & 0xFF);
4376             }
4377           else
4378             EMIT_ONE_BYTE (coding->default_char);
4379         }
4380     }
4381
4382   coding->result = CODING_RESULT_SUCCESS;
4383   coding->produced_char += produced_chars;
4384   coding->produced = dst - coding->destination;
4385   return 0;
4386 }
4387
4388 \f
4389 /*** 7. C library functions ***/
4390
4391 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
4392    has a property `coding-system'.  The value of this property is a
4393    vector of length 5 (called as coding-vector).  Among elements of
4394    this vector, the first (element[0]) and the fifth (element[4])
4395    carry important information for decoding/encoding.  Before
4396    decoding/encoding, this information should be set in fields of a
4397    structure of type `coding_system'.
4398
4399    A value of property `coding-system' can be a symbol of another
4400    subsidiary coding-system.  In that case, Emacs gets coding-vector
4401    from that symbol.
4402
4403    `element[0]' contains information to be set in `coding->type'.  The
4404    value and its meaning is as follows:
4405
4406    0 -- coding_type_emacs_mule
4407    1 -- coding_type_sjis
4408    2 -- coding_type_iso_2022
4409    3 -- coding_type_big5
4410    4 -- coding_type_ccl encoder/decoder written in CCL
4411    nil -- coding_type_no_conversion
4412    t -- coding_type_undecided (automatic conversion on decoding,
4413                                no-conversion on encoding)
4414
4415    `element[4]' contains information to be set in `coding->flags' and
4416    `coding->spec'.  The meaning varies by `coding->type'.
4417
4418    If `coding->type' is `coding_type_iso_2022', element[4] is a vector
4419    of length 32 (of which the first 13 sub-elements are used now).
4420    Meanings of these sub-elements are:
4421
4422    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso_2022'
4423         If the value is an integer of valid charset, the charset is
4424         assumed to be designated to graphic register N initially.
4425
4426         If the value is minus, it is a minus value of charset which
4427         reserves graphic register N, which means that the charset is
4428         not designated initially but should be designated to graphic
4429         register N just before encoding a character in that charset.
4430
4431         If the value is nil, graphic register N is never used on
4432         encoding.
4433
4434    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
4435         Each value takes t or nil.  See the section ISO2022 of
4436         `coding.h' for more information.
4437
4438    If `coding->type' is `coding_type_big5', element[4] is t to denote
4439    BIG5-ETen or nil to denote BIG5-HKU.
4440
4441    If `coding->type' takes the other value, element[4] is ignored.
4442
4443    Emacs Lisp's coding system also carries information about format of
4444    end-of-line in a value of property `eol-type'.  If the value is
4445    integer, 0 means eol_lf, 1 means eol_crlf, and 2 means eol_cr.  If
4446    it is not integer, it should be a vector of subsidiary coding
4447    systems of which property `eol-type' has one of above values.
4448
4449 */
4450
4451 /* Setup coding context CODING from information about CODING_SYSTEM.
4452    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4453    CODING_SYSTEM is invalid, signal an error.  */
4454
4455 void
4456 setup_coding_system (coding_system, coding)
4457      Lisp_Object coding_system;
4458      struct coding_system *coding;
4459 {
4460   Lisp_Object attrs;
4461   Lisp_Object eol_type;
4462   Lisp_Object coding_type;
4463   Lisp_Object val;
4464
4465   if (NILP (coding_system))
4466     coding_system = Qno_conversion;
4467
4468   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4469
4470   attrs = CODING_ID_ATTRS (coding->id);
4471   eol_type = CODING_ID_EOL_TYPE (coding->id);
4472
4473   coding->mode = 0;
4474   coding->head_ascii = -1;
4475   coding->common_flags
4476     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4477
4478   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4479   coding->max_charset_id = XSTRING (val)->size - 1;
4480   coding->safe_charsets = (char *) XSTRING (val)->data;
4481   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4482
4483   coding_type = CODING_ATTR_TYPE (attrs);
4484   if (EQ (coding_type, Qundecided))
4485     {
4486       coding->detector = NULL;
4487       coding->decoder = decode_coding_raw_text;
4488       coding->encoder = encode_coding_raw_text;
4489       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4490     }
4491   else if (EQ (coding_type, Qiso_2022))
4492     {
4493       int i;
4494       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4495
4496       /* Invoke graphic register 0 to plane 0.  */
4497       CODING_ISO_INVOCATION (coding, 0) = 0;
4498       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4499       CODING_ISO_INVOCATION (coding, 1)
4500         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4501       /* Setup the initial status of designation.  */
4502       for (i = 0; i < 4; i++)
4503         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4504       /* Not single shifting initially.  */
4505       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4506       /* Beginning of buffer should also be regarded as bol. */
4507       CODING_ISO_BOL (coding) = 1;
4508       coding->detector = detect_coding_iso_2022;
4509       coding->decoder = decode_coding_iso_2022;
4510       coding->encoder = encode_coding_iso_2022;
4511       if (flags & CODING_ISO_FLAG_SAFE)
4512         coding->mode |= CODING_MODE_SAFE_ENCODING;
4513       coding->common_flags
4514         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4515             | CODING_REQUIRE_FLUSHING_MASK);
4516       if (flags & CODING_ISO_FLAG_COMPOSITION)
4517         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4518       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4519         {
4520           setup_iso_safe_charsets (attrs);
4521           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4522           coding->max_charset_id = XSTRING (val)->size - 1;
4523           coding->safe_charsets = (char *) XSTRING (val)->data;
4524         }
4525       CODING_ISO_FLAGS (coding) = flags;
4526     }
4527   else if (EQ (coding_type, Qcharset))
4528     {
4529       coding->detector = detect_coding_charset;
4530       coding->decoder = decode_coding_charset;
4531       coding->encoder = encode_coding_charset;
4532       coding->common_flags
4533         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4534     }
4535   else if (EQ (coding_type, Qutf_8))
4536     {
4537       coding->detector = detect_coding_utf_8;
4538       coding->decoder = decode_coding_utf_8;
4539       coding->encoder = encode_coding_utf_8;
4540       coding->common_flags
4541         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4542     }
4543   else if (EQ (coding_type, Qutf_16))
4544     {
4545       val = AREF (attrs, coding_attr_utf_16_bom);
4546       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4547                                     : EQ (val, Qt) ? utf_16_with_bom
4548                                     : utf_16_without_bom);
4549       val = AREF (attrs, coding_attr_utf_16_endian);
4550       CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4551                                        : utf_16_little_endian);
4552       CODING_UTF_16_SURROGATE (coding) = 0;
4553       coding->detector = detect_coding_utf_16;
4554       coding->decoder = decode_coding_utf_16;
4555       coding->encoder = encode_coding_utf_16;
4556       coding->common_flags
4557         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4558     }
4559   else if (EQ (coding_type, Qccl))
4560     {
4561       coding->detector = detect_coding_ccl;
4562       coding->decoder = decode_coding_ccl;
4563       coding->encoder = encode_coding_ccl;
4564       coding->common_flags
4565         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4566             | CODING_REQUIRE_FLUSHING_MASK);
4567     }
4568   else if (EQ (coding_type, Qemacs_mule))
4569     {
4570       coding->detector = detect_coding_emacs_mule;
4571       coding->decoder = decode_coding_emacs_mule;
4572       coding->encoder = encode_coding_emacs_mule;
4573       coding->common_flags
4574         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4575       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4576           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4577         {
4578           Lisp_Object tail, safe_charsets;
4579           int max_charset_id = 0;
4580
4581           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4582                tail = XCDR (tail))
4583             if (max_charset_id < XFASTINT (XCAR (tail)))
4584               max_charset_id = XFASTINT (XCAR (tail));
4585           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4586                                         make_number (255));
4587           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4588                tail = XCDR (tail))
4589             XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4590           coding->max_charset_id = max_charset_id;
4591           coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4592         }
4593     }
4594   else if (EQ (coding_type, Qshift_jis))
4595     {
4596       coding->detector = detect_coding_sjis;
4597       coding->decoder = decode_coding_sjis;
4598       coding->encoder = encode_coding_sjis;
4599       coding->common_flags
4600         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4601     }
4602   else if (EQ (coding_type, Qbig5))
4603     {
4604       coding->detector = detect_coding_big5;
4605       coding->decoder = decode_coding_big5;
4606       coding->encoder = encode_coding_big5;
4607       coding->common_flags
4608         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4609     }
4610   else                          /* EQ (coding_type, Qraw_text) */
4611     {
4612       coding->detector = NULL;
4613       coding->decoder = decode_coding_raw_text;
4614       coding->encoder = encode_coding_raw_text;
4615       coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4616     }
4617
4618   return;
4619 }
4620
4621 /* Return raw-text or one of its subsidiaries that has the same
4622    eol_type as CODING-SYSTEM.  */
4623
4624 Lisp_Object
4625 raw_text_coding_system (coding_system)
4626      Lisp_Object coding_system;
4627 {
4628   Lisp_Object spec, attrs;
4629   Lisp_Object eol_type, raw_text_eol_type;
4630
4631   spec = CODING_SYSTEM_SPEC (coding_system);
4632   attrs = AREF (spec, 0);
4633
4634   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4635     return coding_system;
4636
4637   eol_type = AREF (spec, 2);
4638   if (VECTORP (eol_type))
4639     return Qraw_text;
4640   spec = CODING_SYSTEM_SPEC (Qraw_text);
4641   raw_text_eol_type = AREF (spec, 2);
4642   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4643           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4644           : AREF (raw_text_eol_type, 2));
4645 }
4646
4647
4648 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4649    does, return one of the subsidiary that has the same eol-spec as
4650    PARENT.  Otherwise, return CODING_SYSTEM.  */
4651
4652 Lisp_Object
4653 coding_inherit_eol_type (coding_system, parent)
4654 {
4655   Lisp_Object spec, attrs, eol_type;
4656
4657   spec = CODING_SYSTEM_SPEC (coding_system);
4658   attrs = AREF (spec, 0);
4659   eol_type = AREF (spec, 2);
4660   if (VECTORP (eol_type))
4661     {
4662       Lisp_Object parent_spec;
4663       Lisp_Object parent_eol_type;
4664
4665       parent_spec
4666         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
4667       parent_eol_type = AREF (parent_spec, 2);
4668       if (EQ (parent_eol_type, Qunix))
4669         coding_system = AREF (eol_type, 0);
4670       else if (EQ (parent_eol_type, Qdos))
4671         coding_system = AREF (eol_type, 1);
4672       else if (EQ (parent_eol_type, Qmac))
4673         coding_system = AREF (eol_type, 2);
4674     }
4675   return coding_system;
4676 }
4677
4678 /* Emacs has a mechanism to automatically detect a coding system if it
4679    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
4680    it's impossible to distinguish some coding systems accurately
4681    because they use the same range of codes.  So, at first, coding
4682    systems are categorized into 7, those are:
4683
4684    o coding-category-emacs-mule
4685
4686         The category for a coding system which has the same code range
4687         as Emacs' internal format.  Assigned the coding-system (Lisp
4688         symbol) `emacs-mule' by default.
4689
4690    o coding-category-sjis
4691
4692         The category for a coding system which has the same code range
4693         as SJIS.  Assigned the coding-system (Lisp
4694         symbol) `japanese-shift-jis' by default.
4695
4696    o coding-category-iso-7
4697
4698         The category for a coding system which has the same code range
4699         as ISO2022 of 7-bit environment.  This doesn't use any locking
4700         shift and single shift functions.  This can encode/decode all
4701         charsets.  Assigned the coding-system (Lisp symbol)
4702         `iso-2022-7bit' by default.
4703
4704    o coding-category-iso-7-tight
4705
4706         Same as coding-category-iso-7 except that this can
4707         encode/decode only the specified charsets.
4708
4709    o coding-category-iso-8-1
4710
4711         The category for a coding system which has the same code range
4712         as ISO2022 of 8-bit environment and graphic plane 1 used only
4713         for DIMENSION1 charset.  This doesn't use any locking shift
4714         and single shift functions.  Assigned the coding-system (Lisp
4715         symbol) `iso-latin-1' by default.
4716
4717    o coding-category-iso-8-2
4718
4719         The category for a coding system which has the same code range
4720         as ISO2022 of 8-bit environment and graphic plane 1 used only
4721         for DIMENSION2 charset.  This doesn't use any locking shift
4722         and single shift functions.  Assigned the coding-system (Lisp
4723         symbol) `japanese-iso-8bit' by default.
4724
4725    o coding-category-iso-7-else
4726
4727         The category for a coding system which has the same code range
4728         as ISO2022 of 7-bit environemnt but uses locking shift or
4729         single shift functions.  Assigned the coding-system (Lisp
4730         symbol) `iso-2022-7bit-lock' by default.
4731
4732    o coding-category-iso-8-else
4733
4734         The category for a coding system which has the same code range
4735         as ISO2022 of 8-bit environemnt but uses locking shift or
4736         single shift functions.  Assigned the coding-system (Lisp
4737         symbol) `iso-2022-8bit-ss2' by default.
4738
4739    o coding-category-big5
4740
4741         The category for a coding system which has the same code range
4742         as BIG5.  Assigned the coding-system (Lisp symbol)
4743         `cn-big5' by default.
4744
4745    o coding-category-utf-8
4746
4747         The category for a coding system which has the same code range
4748         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
4749         symbol) `utf-8' by default.
4750
4751    o coding-category-utf-16-be
4752
4753         The category for a coding system in which a text has an
4754         Unicode signature (cf. Unicode Standard) in the order of BIG
4755         endian at the head.  Assigned the coding-system (Lisp symbol)
4756         `utf-16-be' by default.
4757
4758    o coding-category-utf-16-le
4759
4760         The category for a coding system in which a text has an
4761         Unicode signature (cf. Unicode Standard) in the order of
4762         LITTLE endian at the head.  Assigned the coding-system (Lisp
4763         symbol) `utf-16-le' by default.
4764
4765    o coding-category-ccl
4766
4767         The category for a coding system of which encoder/decoder is
4768         written in CCL programs.  The default value is nil, i.e., no
4769         coding system is assigned.
4770
4771    o coding-category-binary
4772
4773         The category for a coding system not categorized in any of the
4774         above.  Assigned the coding-system (Lisp symbol)
4775         `no-conversion' by default.
4776
4777    Each of them is a Lisp symbol and the value is an actual
4778    `coding-system's (this is also a Lisp symbol) assigned by a user.
4779    What Emacs does actually is to detect a category of coding system.
4780    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4781    decide only one possible category, it selects a category of the
4782    highest priority.  Priorities of categories are also specified by a
4783    user in a Lisp variable `coding-category-list'.
4784
4785 */
4786
4787 #define EOL_SEEN_NONE   0
4788 #define EOL_SEEN_LF     1
4789 #define EOL_SEEN_CR     2
4790 #define EOL_SEEN_CRLF   4
4791
4792 /* Detect how end-of-line of a text of length CODING->src_bytes
4793    pointed by CODING->source is encoded.  Return one of
4794    EOL_SEEN_XXX.  */
4795
4796 #define MAX_EOL_CHECK_COUNT 3
4797
4798 static int
4799 detect_eol (coding, source, src_bytes)
4800      struct coding_system *coding;
4801      unsigned char *source;
4802      EMACS_INT src_bytes;
4803 {
4804   Lisp_Object attrs, coding_type;
4805   unsigned char *src = source, *src_end = src + src_bytes;
4806   unsigned char c;
4807   int total  = 0;
4808   int eol_seen = EOL_SEEN_NONE;
4809
4810   attrs = CODING_ID_ATTRS (coding->id);
4811   coding_type = CODING_ATTR_TYPE (attrs);
4812
4813   if (EQ (coding_type, Qccl))
4814     {
4815       int msb, lsb;
4816
4817       msb = coding->spec.utf_16.endian == utf_16_little_endian;
4818       lsb = 1 - msb;
4819
4820       while (src + 1 < src_end)
4821         {
4822           c = src[lsb];
4823           if (src[msb] == 0 && (c == '\n' || c == '\r'))
4824             {
4825               int this_eol;
4826
4827               if (c == '\n')
4828                 this_eol = EOL_SEEN_LF;
4829               else if (src + 3 >= src_end
4830                        || src[msb + 2] != 0
4831                        || src[lsb + 2] != '\n')
4832                 this_eol = EOL_SEEN_CR;
4833               else
4834                 this_eol = EOL_SEEN_CRLF;
4835
4836               if (eol_seen == EOL_SEEN_NONE)
4837                 /* This is the first end-of-line.  */
4838                 eol_seen = this_eol;
4839               else if (eol_seen != this_eol)
4840                 {
4841                   /* The found type is different from what found before.  */
4842                   eol_seen = EOL_SEEN_LF;
4843                   break;
4844                 }
4845               if (++total == MAX_EOL_CHECK_COUNT)
4846                 break;
4847             }
4848           src += 2;
4849         }
4850     }
4851   else
4852     {
4853       while (src < src_end)
4854         {
4855           c = *src++;
4856           if (c == '\n' || c == '\r')
4857             {
4858               int this_eol;
4859
4860               if (c == '\n')
4861                 this_eol = EOL_SEEN_LF;
4862               else if (src >= src_end || *src != '\n')
4863                 this_eol = EOL_SEEN_CR;
4864               else
4865                 this_eol = EOL_SEEN_CRLF, src++;
4866
4867               if (eol_seen == EOL_SEEN_NONE)
4868                 /* This is the first end-of-line.  */
4869                 eol_seen = this_eol;
4870               else if (eol_seen != this_eol)
4871                 {
4872                   /* The found type is different from what found before.  */
4873                   eol_seen = EOL_SEEN_LF;
4874                   break;
4875                 }
4876               if (++total == MAX_EOL_CHECK_COUNT)
4877                 break;
4878             }
4879         }
4880     }
4881   return eol_seen;
4882 }
4883
4884
4885 static void
4886 adjust_coding_eol_type (coding, eol_seen)
4887      struct coding_system *coding;
4888      int eol_seen;
4889 {
4890   Lisp_Object eol_type;
4891
4892   eol_type = CODING_ID_EOL_TYPE (coding->id);
4893   if (eol_seen & EOL_SEEN_LF)
4894     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
4895   else if (eol_type & EOL_SEEN_CRLF)
4896     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
4897   else if (eol_type & EOL_SEEN_CR)
4898     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
4899 }
4900
4901 /* Detect how a text specified in CODING is encoded.  If a coding
4902    system is detected, update fields of CODING by the detected coding
4903    system.  */
4904
4905 void
4906 detect_coding (coding)
4907      struct coding_system *coding;
4908 {
4909   unsigned char *src, *src_end;
4910   Lisp_Object attrs, coding_type;
4911
4912   coding->consumed = coding->consumed_char = 0;
4913   coding->produced = coding->produced_char = 0;
4914   coding_set_source (coding);
4915
4916   src_end = coding->source + coding->src_bytes;
4917
4918   /* If we have not yet decided the text encoding type, detect it
4919      now.  */
4920   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
4921     {
4922       int mask = CATEGORY_MASK_ANY;
4923       int c, i;
4924
4925       for (src = coding->source; src < src_end; src++)
4926         {
4927           c = *src;
4928           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
4929                                         || c == ISO_CODE_SI
4930                                         || c == ISO_CODE_SO)))
4931             break;
4932         }
4933       coding->head_ascii = src - (coding->source + coding->consumed);
4934
4935       if (coding->head_ascii < coding->src_bytes)
4936         {
4937           int detected = 0;
4938
4939           for (i = 0; i < coding_category_raw_text; i++)
4940             {
4941               enum coding_category category = coding_priorities[i];
4942               struct coding_system *this = coding_categories + category;
4943
4944               if (category >= coding_category_raw_text
4945                   || detected & (1 << category))
4946                 continue;
4947
4948               if (this->id < 0)
4949                 {
4950                   /* No coding system of this category is defined.  */
4951                   mask &= ~(1 << category);
4952                 }
4953               else
4954                 {
4955                   detected |= detected_mask[category];
4956                   if ((*(this->detector)) (coding, &mask))
4957                     break;
4958                 }
4959             }
4960           if (! mask)
4961             setup_coding_system (Qraw_text, coding);
4962           else if (mask != CATEGORY_MASK_ANY)
4963             for (i = 0; i < coding_category_raw_text; i++)
4964               {
4965                 enum coding_category category = coding_priorities[i];
4966                 struct coding_system *this = coding_categories + category;
4967
4968                 if (mask & (1 << category))
4969                   {
4970                     setup_coding_system (CODING_ID_NAME (this->id), coding);
4971                     break;
4972                   }
4973               }
4974         }
4975     }
4976
4977   attrs = CODING_ID_ATTRS (coding->id);
4978   coding_type = CODING_ATTR_TYPE (attrs);
4979
4980   /* If we have not yet decided the EOL type, detect it now.  But, the
4981      detection is impossible for a CCL based coding system, in which
4982      case, we detct the EOL type after decoding.  */
4983   if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
4984       && ! EQ (coding_type, Qccl))
4985     {
4986       int eol_seen = detect_eol (coding, coding->source, coding->src_bytes);
4987
4988       if (eol_seen != EOL_SEEN_NONE)
4989         adjust_coding_eol_type (coding, eol_seen);
4990     }
4991 }
4992
4993
4994 static void
4995 decode_eol (coding)
4996      struct coding_system *coding;
4997 {
4998   if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
4999     {
5000       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5001       unsigned char *pend = p + coding->produced;
5002       int eol_seen = EOL_SEEN_NONE;
5003
5004       for (; p < pend; p++)
5005         {
5006           if (*p == '\n')
5007             eol_seen |= EOL_SEEN_LF;
5008           else if (*p == '\r')
5009             {
5010               if (p + 1 < pend && *(p + 1) == '\n')
5011                 {
5012                   eol_seen |= EOL_SEEN_CRLF;
5013                   p++;
5014                 }
5015               else
5016                 eol_seen |= EOL_SEEN_CR;
5017             }
5018         }
5019       if (eol_seen != EOL_SEEN_NONE)
5020         adjust_coding_eol_type (coding, eol_seen);
5021     }
5022
5023   if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5024     {
5025       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5026       unsigned char *pend = p + coding->produced;
5027
5028       for (; p < pend; p++)
5029         if (*p == '\r')
5030           *p = '\n';
5031     }
5032   else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5033     {
5034       unsigned char *p, *pbeg, *pend;
5035       Lisp_Object undo_list;
5036
5037       move_gap_both (coding->dst_pos + coding->produced_char,
5038                      coding->dst_pos_byte + coding->produced);
5039       undo_list = current_buffer->undo_list;
5040       current_buffer->undo_list = Qt;
5041       del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, Qnil);
5042       current_buffer->undo_list = undo_list;
5043       pbeg = GPT_ADDR;
5044       pend = pbeg + coding->produced;
5045
5046       for (p = pend - 1; p >= pbeg; p--)
5047         if (*p == '\r')
5048           {
5049             safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5050             pend--;
5051           }
5052       coding->produced_char -= coding->produced - (pend - pbeg);
5053       coding->produced = pend - pbeg;
5054       insert_from_gap (coding->produced_char, coding->produced);
5055     }
5056 }
5057
5058 static void
5059 translate_chars (coding, table)
5060      struct coding_system *coding;
5061      Lisp_Object table;
5062 {
5063   int *charbuf = coding->charbuf;
5064   int *charbuf_end = charbuf + coding->charbuf_used;
5065   int c;
5066
5067   if (coding->chars_at_source)
5068     return;
5069
5070   while (charbuf < charbuf_end)
5071     {
5072       c = *charbuf;
5073       if (c < 0)
5074         charbuf += c;
5075       else
5076         *charbuf++ = translate_char (table, c);
5077     }
5078 }
5079
5080 static int
5081 produce_chars (coding)
5082      struct coding_system *coding;
5083 {
5084   unsigned char *dst = coding->destination + coding->produced;
5085   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5086   int produced;
5087   int produced_chars = 0;
5088
5089   if (! coding->chars_at_source)
5090     {
5091       /* Characters are in coding->charbuf.  */
5092       int *buf = coding->charbuf;
5093       int *buf_end = buf + coding->charbuf_used;
5094       unsigned char *adjusted_dst_end;
5095
5096       if (BUFFERP (coding->src_object)
5097           && EQ (coding->src_object, coding->dst_object))
5098         dst_end = coding->source + coding->consumed;
5099       adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5100
5101       while (buf < buf_end)
5102         {
5103           int c = *buf++;
5104
5105           if (dst >= adjusted_dst_end)
5106             {
5107               dst = alloc_destination (coding,
5108                                        buf_end - buf + MAX_MULTIBYTE_LENGTH,
5109                                        dst);
5110               dst_end = coding->destination + coding->dst_bytes;
5111               adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5112             }
5113           if (c >= 0)
5114             {
5115               if (coding->dst_multibyte
5116                   || ! CHAR_BYTE8_P (c))
5117                 CHAR_STRING_ADVANCE (c, dst);
5118               else
5119                 *dst++ = CHAR_TO_BYTE8 (c);
5120               produced_chars++;
5121             }
5122           else
5123             /* This is an annotation data.  */
5124             buf -= c + 1;
5125         }
5126     }
5127   else
5128     {
5129       unsigned char *src = coding->source;
5130       unsigned char *src_end = src + coding->src_bytes;
5131       Lisp_Object eol_type;
5132
5133       eol_type = CODING_ID_EOL_TYPE (coding->id);
5134
5135       if (coding->src_multibyte != coding->dst_multibyte)
5136         {
5137           if (coding->src_multibyte)
5138             {
5139               int multibytep = 1;
5140               int consumed_chars;
5141
5142               while (1)
5143                 {
5144                   unsigned char *src_base = src;
5145                   int c;
5146
5147                   ONE_MORE_BYTE (c);
5148                   if (c == '\r')
5149                     {
5150                       if (EQ (eol_type, Qdos))
5151                         {
5152                           if (src < src_end
5153                               && *src == '\n')
5154                             c = *src++;
5155                         }
5156                       else if (EQ (eol_type, Qmac))
5157                         c = '\n';
5158                     }
5159                   if (dst == dst_end)
5160                     {
5161                       coding->consumed = src - coding->source;
5162
5163                     if (EQ (coding->src_object, coding->dst_object))
5164                       dst_end = src;
5165                     if (dst == dst_end)
5166                       {
5167                         dst = alloc_destination (coding, src_end - src + 1,
5168                                                  dst);
5169                         dst_end = coding->destination + coding->dst_bytes;
5170                         coding_set_source (coding);
5171                         src = coding->source + coding->consumed;
5172                         src_end = coding->source + coding->src_bytes;
5173                       }
5174                     }
5175                   *dst++ = c;
5176                   produced_chars++;
5177                 }
5178             no_more_source:
5179               ;
5180             }
5181           else
5182             while (src < src_end)
5183               {
5184                 int multibytep = 1;
5185                 int c = *src++;
5186
5187                 if (c == '\r')
5188                   {
5189                     if (EQ (eol_type, Qdos))
5190                       {
5191                         if (src < src_end
5192                             && *src == '\n')
5193                           c = *src++;
5194                       }
5195                     else if (EQ (eol_type, Qmac))
5196                       c = '\n';
5197                   }
5198                 if (dst >= dst_end - 1)
5199                   {
5200                     coding->consumed = src - coding->source;
5201
5202                     if (EQ (coding->src_object, coding->dst_object))
5203                       dst_end = src;
5204                     if (dst >= dst_end - 1)
5205                       {
5206                         dst = alloc_destination (coding, src_end - src + 2,
5207                                                  dst);
5208                         dst_end = coding->destination + coding->dst_bytes;
5209                         coding_set_source (coding);
5210                         src = coding->source + coding->consumed;
5211                         src_end = coding->source + coding->src_bytes;
5212                       }
5213                   }
5214                 EMIT_ONE_BYTE (c);
5215               }
5216         }
5217       else
5218         {
5219           if (!EQ (coding->src_object, coding->dst_object))
5220             {
5221               int require = coding->src_bytes - coding->dst_bytes;
5222
5223               if (require > 0)
5224                 {
5225                   EMACS_INT offset = src - coding->source;
5226
5227                   dst = alloc_destination (coding, require, dst);
5228                   coding_set_source (coding);
5229                   src = coding->source + offset;
5230                   src_end = coding->source + coding->src_bytes;
5231                 }
5232             }
5233           produced_chars = coding->src_chars;
5234           while (src < src_end)
5235             {
5236               int c = *src++;
5237
5238               if (c == '\r')
5239                 {
5240                   if (EQ (eol_type, Qdos))
5241                     {
5242                       if (src < src_end
5243                           && *src == '\n')
5244                         c = *src++;
5245                       produced_chars--;
5246                     }
5247                   else if (EQ (eol_type, Qmac))
5248                     c = '\n';
5249                 }
5250               *dst++ = c;
5251             }
5252         }
5253       coding->consumed = coding->src_bytes;
5254       coding->consumed_char = coding->src_chars;
5255     }
5256
5257   produced = dst - (coding->destination + coding->produced);
5258   if (BUFFERP (coding->dst_object))
5259     insert_from_gap (produced_chars, produced);
5260   coding->produced += produced;
5261   coding->produced_char += produced_chars;
5262   return produced_chars;
5263 }
5264
5265 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5266         or
5267    [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5268  */
5269
5270 static INLINE void
5271 produce_composition (coding, charbuf)
5272      struct coding_system *coding;
5273      int *charbuf;
5274 {
5275   Lisp_Object buffer;
5276   int len;
5277   EMACS_INT pos;
5278   enum composition_method method;
5279   int cmp_len;
5280   Lisp_Object components;
5281
5282   buffer = coding->dst_object;
5283   len = -charbuf[0];
5284   pos = coding->dst_pos + charbuf[1];
5285   method = (enum composition_method) (charbuf[3]);
5286   cmp_len = charbuf[4];
5287
5288   if (method == COMPOSITION_RELATIVE)
5289     components = Qnil;
5290   else
5291     {
5292       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5293       int i;
5294
5295       len -= 5;
5296       charbuf += 5;
5297       for (i = 0; i < len; i++)
5298         args[i] = make_number (charbuf[i]);
5299       components = (method == COMPOSITION_WITH_ALTCHARS
5300                     ? Fstring (len, args) : Fvector (len, args));
5301     }
5302   compose_text (pos, pos + cmp_len, components, Qnil, Qnil);
5303 }
5304
5305 static int *
5306 save_composition_data (buf, buf_end, prop)
5307      int *buf, *buf_end;
5308      Lisp_Object prop;
5309 {
5310   enum composition_method method = COMPOSITION_METHOD (prop);
5311   int cmp_len = COMPOSITION_LENGTH (prop);
5312
5313   if (buf + 4 + (MAX_COMPOSITION_COMPONENTS * 2 - 1) > buf_end)
5314     return NULL;
5315
5316   buf[1] = CODING_ANNOTATE_COMPOSITION_MASK;
5317   buf[2] = method;
5318   buf[3] = cmp_len;
5319
5320   if (method == COMPOSITION_RELATIVE)
5321     buf[0] = 4;
5322   else
5323     {
5324       Lisp_Object components;
5325       int len, i;
5326
5327       components = COMPOSITION_COMPONENTS (prop);
5328       if (VECTORP (components))
5329         {
5330           len = XVECTOR (components)->size;
5331           for (i = 0; i < len; i++)
5332             buf[4 + i] = XINT (AREF (components, i));
5333         }
5334       else if (STRINGP (components))
5335         {
5336           int i_byte;
5337
5338           len = XSTRING (components)->size;
5339           i = i_byte = 0;
5340           while (i < len)
5341             FETCH_STRING_CHAR_ADVANCE (buf[4 + i], components, i, i_byte);
5342         }
5343       else if (INTEGERP (components))
5344         {
5345           len = 1;
5346           buf[4] = XINT (components);
5347         }
5348       else if (CONSP (components))
5349         {
5350           for (len = 0; CONSP (components);
5351                len++, components = XCDR (components))
5352             buf[4 + len] = XINT (XCAR (components));
5353         }
5354       else
5355         abort ();
5356       buf[0] = 4 + len;
5357     }
5358   return (buf + buf[0]);
5359 }
5360
5361 #define CHARBUF_SIZE 0x4000
5362
5363 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
5364   do {                                                                  \
5365     int size = CHARBUF_SIZE;;                                           \
5366                                                                         \
5367     coding->charbuf = NULL;                                             \
5368     while (size > 1024)                                                 \
5369       {                                                                 \
5370         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
5371         if (coding->charbuf)                                            \
5372           break;                                                        \
5373         size >>= 1;                                                     \
5374       }                                                                 \
5375     if (! coding->charbuf)                                              \
5376       {                                                                 \
5377         coding->result = CODING_RESULT_INSUFFICIENT_MEM;                \
5378         return coding->result;                                          \
5379       }                                                                 \
5380     coding->charbuf_size = size;                                        \
5381   } while (0)
5382
5383
5384 static void
5385 produce_annotation (coding)
5386      struct coding_system *coding;
5387 {
5388   int *charbuf = coding->charbuf;
5389   int *charbuf_end = charbuf + coding->charbuf_used;
5390
5391   while (charbuf < charbuf_end)
5392     {
5393       if (*charbuf >= 0)
5394         charbuf++;
5395       else
5396         {
5397           int len = -*charbuf;
5398           switch (charbuf[2])
5399             {
5400             case CODING_ANNOTATE_COMPOSITION_MASK:
5401               produce_composition (coding, charbuf);
5402               break;
5403             default:
5404               abort ();
5405             }
5406           charbuf += len;
5407         }
5408     }
5409 }
5410
5411 /* Decode the data at CODING->src_object into CODING->dst_object.
5412    CODING->src_object is a buffer, a string, or nil.
5413    CODING->dst_object is a buffer.
5414
5415    If CODING->src_object is a buffer, it must be the current buffer.
5416    In this case, if CODING->src_pos is positive, it is a position of
5417    the source text in the buffer, otherwise, the source text is in the
5418    gap area of the buffer, and CODING->src_pos specifies the offset of
5419    the text from GPT (which must be the same as PT).  If this is the
5420    same buffer as CODING->dst_object, CODING->src_pos must be
5421    negative.
5422
5423    If CODING->src_object is a string, CODING->src_pos in an index to
5424    that string.
5425
5426    If CODING->src_object is nil, CODING->source must already point to
5427    the non-relocatable memory area.  In this case, CODING->src_pos is
5428    an offset from CODING->source.
5429
5430    The decoded data is inserted at the current point of the buffer
5431    CODING->dst_object.
5432 */
5433
5434 static int
5435 decode_coding (coding)
5436      struct coding_system *coding;
5437 {
5438   Lisp_Object attrs;
5439
5440   if (BUFFERP (coding->src_object)
5441       && coding->src_pos > 0
5442       && coding->src_pos < GPT
5443       && coding->src_pos + coding->src_chars > GPT)
5444     move_gap_both (coding->src_pos, coding->src_pos_byte);
5445
5446   if (BUFFERP (coding->dst_object))
5447     {
5448       if (current_buffer != XBUFFER (coding->dst_object))
5449         set_buffer_internal (XBUFFER (coding->dst_object));
5450       if (GPT != PT)
5451         move_gap_both (PT, PT_BYTE);
5452     }
5453
5454   coding->consumed = coding->consumed_char = 0;
5455   coding->produced = coding->produced_char = 0;
5456   coding->chars_at_source = 0;
5457   coding->result = CODING_RESULT_SUCCESS;
5458   coding->errors = 0;
5459
5460   ALLOC_CONVERSION_WORK_AREA (coding);
5461
5462   attrs = CODING_ID_ATTRS (coding->id);
5463
5464   do
5465     {
5466       coding_set_source (coding);
5467       coding->annotated = 0;
5468       (*(coding->decoder)) (coding);
5469       if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5470         translate_chars (CODING_ATTR_DECODE_TBL (attrs), coding);
5471       coding_set_destination (coding);
5472       produce_chars (coding);
5473       if (coding->annotated)
5474         produce_annotation (coding);
5475     }
5476   while (coding->consumed < coding->src_bytes
5477          && ! coding->result);
5478
5479   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5480       && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5481       && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5482     decode_eol (coding);
5483
5484   coding->carryover_bytes = 0;
5485   if (coding->consumed < coding->src_bytes)
5486     {
5487       int nbytes = coding->src_bytes - coding->consumed;
5488       unsigned char *src;
5489
5490       coding_set_source (coding);
5491       coding_set_destination (coding);
5492       src = coding->source + coding->consumed;
5493
5494       if (coding->mode & CODING_MODE_LAST_BLOCK)
5495         {
5496           /* Flush out unprocessed data as binary chars.  We are sure
5497              that the number of data is less than the size of
5498              coding->charbuf.  */
5499           int *charbuf = coding->charbuf;
5500
5501           while (nbytes-- > 0)
5502             {
5503               int c = *src++;
5504               *charbuf++ =  (c & 0x80 ? - c : c);
5505             }
5506           produce_chars (coding);
5507         }
5508       else
5509         {
5510           /* Record unprocessed bytes in coding->carryover.  We are
5511              sure that the number of data is less than the size of
5512              coding->carryover.  */
5513           unsigned char *p = coding->carryover;
5514
5515           coding->carryover_bytes = nbytes;
5516           while (nbytes-- > 0)
5517             *p++ = *src++;
5518         }
5519       coding->consumed = coding->src_bytes;
5520     }
5521
5522   return coding->result;
5523 }
5524
5525 static void
5526 consume_chars (coding)
5527      struct coding_system *coding;
5528 {
5529   int *buf = coding->charbuf;
5530   /* -1 is to compensate for CRLF.  */
5531   int *buf_end = coding->charbuf + coding->charbuf_size - 1;
5532   unsigned char *src = coding->source + coding->consumed;
5533   int pos = coding->src_pos + coding->consumed_char;
5534   int end_pos = coding->src_pos + coding->src_chars;
5535   int multibytep = coding->src_multibyte;
5536   Lisp_Object eol_type;
5537   int c;
5538   int start, end, stop;
5539   Lisp_Object object, prop;
5540
5541   eol_type = CODING_ID_EOL_TYPE (coding->id);
5542   if (VECTORP (eol_type))
5543     eol_type = Qunix;
5544
5545   object = coding->src_object;
5546
5547   /* Note: composition handling is not yet implemented.  */
5548   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
5549
5550   if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK
5551       && find_composition (pos, end_pos, &start, &end, &prop, object)
5552       && end <= end_pos
5553       && (start >= pos
5554           || (find_composition (end, end_pos, &start, &end, &prop, object)
5555               && end <= end_pos)))
5556     stop = start;
5557   else
5558     stop = end_pos;
5559
5560   while (buf < buf_end)
5561     {
5562       if (pos == stop)
5563         {
5564           int *p;
5565
5566           if (pos == end_pos)
5567             break;
5568           p = save_composition_data (buf, buf_end, prop);
5569           if (p == NULL)
5570             break;
5571           buf = p;
5572           if (find_composition (end, end_pos, &start, &end, &prop, object)
5573               && end <= end_pos)
5574             stop = start;
5575           else
5576             stop = end_pos;
5577         }
5578
5579       if (! multibytep)
5580         c = *src++;
5581       else
5582         c = STRING_CHAR_ADVANCE (src);
5583       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
5584         c = '\n';
5585       if (! EQ (eol_type, Qunix))
5586         {
5587           if (c == '\n')
5588             {
5589               if (EQ (eol_type, Qdos))
5590                 *buf++ = '\r';
5591               else
5592                 c = '\r';
5593             }
5594         }
5595       *buf++ = c;
5596       pos++;
5597     }
5598
5599   coding->consumed = src - coding->source;
5600   coding->consumed_char = pos - coding->src_pos;
5601   coding->charbuf_used = buf - coding->charbuf;
5602   coding->chars_at_source = 0;
5603 }
5604
5605
5606 /* Encode the text at CODING->src_object into CODING->dst_object.
5607    CODING->src_object is a buffer or a string.
5608    CODING->dst_object is a buffer or nil.
5609
5610    If CODING->src_object is a buffer, it must be the current buffer.
5611    In this case, if CODING->src_pos is positive, it is a position of
5612    the source text in the buffer, otherwise. the source text is in the
5613    gap area of the buffer, and coding->src_pos specifies the offset of
5614    the text from GPT (which must be the same as PT).  If this is the
5615    same buffer as CODING->dst_object, CODING->src_pos must be
5616    negative and CODING should not have `pre-write-conversion'.
5617
5618    If CODING->src_object is a string, CODING should not have
5619    `pre-write-conversion'.
5620
5621    If CODING->dst_object is a buffer, the encoded data is inserted at
5622    the current point of that buffer.
5623
5624    If CODING->dst_object is nil, the encoded data is placed at the
5625    memory area specified by CODING->destination.  */
5626
5627 static int
5628 encode_coding (coding)
5629      struct coding_system *coding;
5630 {
5631   Lisp_Object attrs;
5632
5633   attrs = CODING_ID_ATTRS (coding->id);
5634
5635   if (BUFFERP (coding->dst_object))
5636     {
5637       set_buffer_internal (XBUFFER (coding->dst_object));
5638       coding->dst_multibyte
5639         = ! NILP (current_buffer->enable_multibyte_characters);
5640     }
5641
5642   coding->consumed = coding->consumed_char = 0;
5643   coding->produced = coding->produced_char = 0;
5644   coding->result = CODING_RESULT_SUCCESS;
5645   coding->errors = 0;
5646
5647   ALLOC_CONVERSION_WORK_AREA (coding);
5648
5649   do {
5650     coding_set_source (coding);
5651     consume_chars (coding);
5652
5653     if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
5654       translate_chars (CODING_ATTR_ENCODE_TBL (attrs), coding);
5655
5656     coding_set_destination (coding);
5657     (*(coding->encoder)) (coding);
5658   } while (coding->consumed_char < coding->src_chars);
5659
5660   if (BUFFERP (coding->dst_object))
5661     insert_from_gap (coding->produced_char, coding->produced);
5662
5663   return (coding->result);
5664 }
5665
5666 /* Work buffer */
5667
5668 /* List of currently used working buffer.  */
5669 Lisp_Object Vcode_conversion_work_buf_list;
5670
5671 /* A working buffer used by the top level conversion.  */
5672 Lisp_Object Vcode_conversion_reused_work_buf;
5673
5674
5675 /* Return a working buffer that can be freely used by the following
5676    code conversion.  MULTIBYTEP specifies the multibyteness of the
5677    buffer.  */
5678
5679 Lisp_Object
5680 make_conversion_work_buffer (multibytep)
5681      int multibytep;
5682 {
5683   struct buffer *current = current_buffer;
5684   Lisp_Object buf;
5685
5686   if (NILP (Vcode_conversion_work_buf_list))
5687     {
5688       if (NILP (Vcode_conversion_reused_work_buf))
5689         Vcode_conversion_reused_work_buf
5690           = Fget_buffer_create (build_string (" *code-conversion-work*"));
5691       Vcode_conversion_work_buf_list
5692         = Fcons (Vcode_conversion_reused_work_buf, Qnil);
5693     }
5694   else
5695     {
5696       int depth = Flength (Vcode_conversion_work_buf_list);
5697       char str[128];
5698
5699       sprintf (str, " *code-conversion-work*<%d>", depth);
5700       Vcode_conversion_work_buf_list
5701         = Fcons (Fget_buffer_create (build_string (str)),
5702                  Vcode_conversion_work_buf_list);
5703     }
5704
5705   buf = XCAR (Vcode_conversion_work_buf_list);
5706   set_buffer_internal (XBUFFER (buf));
5707   current_buffer->undo_list = Qt;
5708   Ferase_buffer ();
5709   Fset_buffer_multibyte (multibytep ? Qt : Qnil);
5710   set_buffer_internal (current);
5711   return buf;
5712 }
5713
5714 static struct coding_system *saved_coding;
5715
5716 Lisp_Object
5717 code_conversion_restore (info)
5718      Lisp_Object info;
5719 {
5720   int depth = Flength (Vcode_conversion_work_buf_list);
5721   Lisp_Object buf;
5722
5723   if (depth > 0)
5724     {
5725       buf = XCAR (Vcode_conversion_work_buf_list);
5726       Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
5727       if (depth > 1 && !NILP (Fbuffer_live_p (buf)))
5728         Fkill_buffer (buf);
5729     }
5730
5731   if (saved_coding->dst_object == Qt
5732       && saved_coding->destination)
5733     xfree (saved_coding->destination);
5734
5735   return save_excursion_restore (info);
5736 }
5737
5738
5739 int
5740 decode_coding_gap (coding, chars, bytes)
5741      struct coding_system *coding;
5742      EMACS_INT chars, bytes;
5743 {
5744   int count = specpdl_ptr - specpdl;
5745
5746   saved_coding = coding;
5747   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5748
5749   coding->src_object = Fcurrent_buffer ();
5750   coding->src_chars = chars;
5751   coding->src_bytes = bytes;
5752   coding->src_pos = -chars;
5753   coding->src_pos_byte = -bytes;
5754   coding->src_multibyte = chars < bytes;
5755   coding->dst_object = coding->src_object;
5756   coding->dst_pos = PT;
5757   coding->dst_pos_byte = PT_BYTE;
5758   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
5759
5760   if (CODING_REQUIRE_DETECTION (coding))
5761     detect_coding (coding);
5762
5763   decode_coding (coding);
5764
5765   unbind_to (count, Qnil);
5766   return coding->result;
5767 }
5768
5769 int
5770 encode_coding_gap (coding, chars, bytes)
5771      struct coding_system *coding;
5772      EMACS_INT chars, bytes;
5773 {
5774   int count = specpdl_ptr - specpdl;
5775   Lisp_Object buffer;
5776
5777   saved_coding = coding;
5778   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5779
5780   buffer = Fcurrent_buffer ();
5781   coding->src_object = buffer;
5782   coding->src_chars = chars;
5783   coding->src_bytes = bytes;
5784   coding->src_pos = -chars;
5785   coding->src_pos_byte = -bytes;
5786   coding->src_multibyte = chars < bytes;
5787   coding->dst_object = coding->src_object;
5788   coding->dst_pos = PT;
5789   coding->dst_pos_byte = PT_BYTE;
5790
5791   encode_coding (coding);
5792
5793   unbind_to (count, Qnil);
5794   return coding->result;
5795 }
5796
5797
5798 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5799    SRC_OBJECT into DST_OBJECT by coding context CODING.
5800
5801    SRC_OBJECT is a buffer, a string, or Qnil.
5802
5803    If it is a buffer, the text is at point of the buffer.  FROM and TO
5804    are positions in the buffer.
5805
5806    If it is a string, the text is at the beginning of the string.
5807    FROM and TO are indices to the string.
5808
5809    If it is nil, the text is at coding->source.  FROM and TO are
5810    indices to coding->source.
5811
5812    DST_OBJECT is a buffer, Qt, or Qnil.
5813
5814    If it is a buffer, the decoded text is inserted at point of the
5815    buffer.  If the buffer is the same as SRC_OBJECT, the source text
5816    is deleted.
5817
5818    If it is Qt, a string is made from the decoded text, and
5819    set in CODING->dst_object.
5820
5821    If it is Qnil, the decoded text is stored at CODING->destination.
5822    The called must allocate CODING->dst_bytes bytes at
5823    CODING->destination by xmalloc.  If the decoded text is longer than
5824    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5825  */
5826
5827 void
5828 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5829                       dst_object)
5830      struct coding_system *coding;
5831      Lisp_Object src_object;
5832      EMACS_INT from, from_byte, to, to_byte;
5833      Lisp_Object dst_object;
5834 {
5835   int count = specpdl_ptr - specpdl;
5836   unsigned char *destination;
5837   EMACS_INT dst_bytes;
5838   EMACS_INT chars = to - from;
5839   EMACS_INT bytes = to_byte - from_byte;
5840   Lisp_Object attrs;
5841
5842   saved_coding = coding;
5843   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5844
5845   if (NILP (dst_object))
5846     {
5847       destination = coding->destination;
5848       dst_bytes = coding->dst_bytes;
5849     }
5850
5851   coding->src_object = src_object;
5852   coding->src_chars = chars;
5853   coding->src_bytes = bytes;
5854   coding->src_multibyte = chars < bytes;
5855
5856   if (STRINGP (src_object))
5857     {
5858       coding->src_pos = from;
5859       coding->src_pos_byte = from_byte;
5860     }
5861   else if (BUFFERP (src_object))
5862     {
5863       set_buffer_internal (XBUFFER (src_object));
5864       if (from != GPT)
5865         move_gap_both (from, from_byte);
5866       if (EQ (src_object, dst_object))
5867         {
5868           TEMP_SET_PT_BOTH (from, from_byte);
5869           del_range_both (from, from_byte, to, to_byte, 1);
5870           coding->src_pos = -chars;
5871           coding->src_pos_byte = -bytes;
5872         }
5873       else
5874         {
5875           coding->src_pos = from;
5876           coding->src_pos_byte = from_byte;
5877         }
5878     }
5879
5880   if (CODING_REQUIRE_DETECTION (coding))
5881     detect_coding (coding);
5882   attrs = CODING_ID_ATTRS (coding->id);
5883
5884   if (! NILP (CODING_ATTR_POST_READ (attrs))
5885       || EQ (dst_object, Qt))
5886     {
5887       coding->dst_object = make_conversion_work_buffer (1);
5888       coding->dst_pos = BEG;
5889       coding->dst_pos_byte = BEG_BYTE;
5890       coding->dst_multibyte = 1;
5891     }
5892   else if (BUFFERP (dst_object))
5893     {
5894       coding->dst_object = dst_object;
5895       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
5896       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
5897       coding->dst_multibyte
5898         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
5899     }
5900   else
5901     {
5902       coding->dst_object = Qnil;
5903       coding->dst_multibyte = 1;
5904     }
5905
5906   decode_coding (coding);
5907
5908   if (BUFFERP (coding->dst_object))
5909     set_buffer_internal (XBUFFER (coding->dst_object));
5910
5911   if (! NILP (CODING_ATTR_POST_READ (attrs)))
5912     {
5913       struct gcpro gcpro1, gcpro2;
5914       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
5915       Lisp_Object val;
5916
5917       GCPRO2 (coding->src_object, coding->dst_object);
5918       val = call1 (CODING_ATTR_POST_READ (attrs),
5919                    make_number (coding->produced_char));
5920       UNGCPRO;
5921       CHECK_NATNUM (val);
5922       coding->produced_char += Z - prev_Z;
5923       coding->produced += Z_BYTE - prev_Z_BYTE;
5924     }
5925
5926   if (EQ (dst_object, Qt))
5927     {
5928       coding->dst_object = Fbuffer_string ();
5929     }
5930   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
5931     {
5932       set_buffer_internal (XBUFFER (coding->dst_object));
5933       if (dst_bytes < coding->produced)
5934         {
5935           destination
5936             = (unsigned char *) xrealloc (destination, coding->produced);
5937           if (! destination)
5938             {
5939               coding->result = CODING_RESULT_INSUFFICIENT_DST;
5940               unbind_to (count, Qnil);
5941               return;
5942             }
5943           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
5944             move_gap_both (BEGV, BEGV_BYTE);
5945           bcopy (BEGV_ADDR, destination, coding->produced);
5946           coding->destination = destination;
5947         }
5948     }
5949
5950   unbind_to (count, Qnil);
5951 }
5952
5953
5954 void
5955 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5956                       dst_object)
5957      struct coding_system *coding;
5958      Lisp_Object src_object;
5959      EMACS_INT from, from_byte, to, to_byte;
5960      Lisp_Object dst_object;
5961 {
5962   int count = specpdl_ptr - specpdl;
5963   EMACS_INT chars = to - from;
5964   EMACS_INT bytes = to_byte - from_byte;
5965   Lisp_Object attrs;
5966
5967   saved_coding = coding;
5968   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5969
5970   coding->src_object = src_object;
5971   coding->src_chars = chars;
5972   coding->src_bytes = bytes;
5973   coding->src_multibyte = chars < bytes;
5974
5975   attrs = CODING_ID_ATTRS (coding->id);
5976
5977   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
5978     {
5979       Lisp_Object val;
5980
5981       coding->src_object = make_conversion_work_buffer (coding->src_multibyte);
5982       set_buffer_internal (XBUFFER (coding->src_object));
5983       if (STRINGP (src_object))
5984         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
5985       else if (BUFFERP (src_object))
5986         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
5987       else
5988         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
5989
5990       if (EQ (src_object, dst_object))
5991         {
5992           set_buffer_internal (XBUFFER (src_object));
5993           del_range_both (from, from_byte, to, to_byte, 1);
5994           set_buffer_internal (XBUFFER (coding->src_object));
5995         }
5996
5997       val = call2 (CODING_ATTR_PRE_WRITE (attrs),
5998                    make_number (1), make_number (chars));
5999       CHECK_NATNUM (val);
6000       if (BEG != GPT)
6001         move_gap_both (BEG, BEG_BYTE);
6002       coding->src_chars = Z - BEG;
6003       coding->src_bytes = Z_BYTE - BEG_BYTE;
6004       coding->src_pos = BEG;
6005       coding->src_pos_byte = BEG_BYTE;
6006       coding->src_multibyte = Z < Z_BYTE;
6007     }
6008   else if (STRINGP (src_object))
6009     {
6010       coding->src_pos = from;
6011       coding->src_pos_byte = from_byte;
6012     }
6013   else if (BUFFERP (src_object))
6014     {
6015       set_buffer_internal (XBUFFER (src_object));
6016       if (from != GPT)
6017         move_gap_both (from, from_byte);
6018       if (EQ (src_object, dst_object))
6019         {
6020           del_range_both (from, from_byte, to, to_byte, 1);
6021           coding->src_pos = -chars;
6022           coding->src_pos_byte = -bytes;
6023         }
6024       else
6025         {
6026           coding->src_pos = from;
6027           coding->src_pos_byte = from_byte;
6028         }
6029     }
6030
6031   if (BUFFERP (dst_object))
6032     {
6033       coding->dst_object = dst_object;
6034       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6035       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6036       coding->dst_multibyte
6037         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6038     }
6039   else if (EQ (dst_object, Qt))
6040     {
6041       coding->dst_object = Qnil;
6042       coding->destination = (unsigned char *) xmalloc (coding->src_chars);
6043       coding->dst_bytes = coding->src_chars;
6044       coding->dst_multibyte = 0;
6045     }
6046   else
6047     {
6048       coding->dst_object = Qnil;
6049       coding->dst_multibyte = 0;
6050     }
6051
6052   encode_coding (coding);
6053
6054   if (EQ (dst_object, Qt))
6055     {
6056       if (BUFFERP (coding->dst_object))
6057         coding->dst_object = Fbuffer_string ();
6058       else
6059         {
6060           coding->dst_object
6061             = make_unibyte_string ((char *) coding->destination,
6062                                    coding->produced);
6063           xfree (coding->destination);
6064         }
6065     }
6066
6067   unbind_to (count, Qnil);
6068 }
6069
6070
6071 Lisp_Object
6072 preferred_coding_system ()
6073 {
6074   int id = coding_categories[coding_priorities[0]].id;
6075
6076   return CODING_ID_NAME (id);
6077 }
6078
6079 \f
6080 #ifdef emacs
6081 /*** 8. Emacs Lisp library functions ***/
6082
6083 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6084        doc: /* Return t if OBJECT is nil or a coding-system.
6085 See the documentation of `define-coding-system' for information
6086 about coding-system objects.  */)
6087      (obj)
6088      Lisp_Object obj;
6089 {
6090   return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6091 }
6092
6093 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6094        Sread_non_nil_coding_system, 1, 1, 0,
6095        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6096      (prompt)
6097      Lisp_Object prompt;
6098 {
6099   Lisp_Object val;
6100   do
6101     {
6102       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6103                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6104     }
6105   while (XSTRING (val)->size == 0);
6106   return (Fintern (val, Qnil));
6107 }
6108
6109 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6110        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6111 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6112      (prompt, default_coding_system)
6113      Lisp_Object prompt, default_coding_system;
6114 {
6115   Lisp_Object val;
6116   if (SYMBOLP (default_coding_system))
6117     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6118   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6119                           Qt, Qnil, Qcoding_system_history,
6120                           default_coding_system, Qnil);
6121   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6122 }
6123
6124 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6125        1, 1, 0,
6126        doc: /* Check validity of CODING-SYSTEM.
6127 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6128 It is valid if it is a symbol with a non-nil `coding-system' property.
6129 The value of property should be a vector of length 5.  */)
6130   (coding_system)
6131      Lisp_Object coding_system;
6132 {
6133   CHECK_SYMBOL (coding_system);
6134   if (!NILP (Fcoding_system_p (coding_system)))
6135     return coding_system;
6136   while (1)
6137     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6138 }
6139
6140 \f
6141 Lisp_Object
6142 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6143      unsigned char *src;
6144      int src_bytes, highest;
6145      int multibytep;
6146      Lisp_Object coding_system;
6147 {
6148   unsigned char *src_end = src + src_bytes;
6149   int mask = CATEGORY_MASK_ANY;
6150   int detected = 0;
6151   int c, i;
6152   Lisp_Object attrs, eol_type;
6153   Lisp_Object val;
6154   struct coding_system coding;
6155
6156   if (NILP (coding_system))
6157     coding_system = Qundecided;
6158   setup_coding_system (coding_system, &coding);
6159   attrs = CODING_ID_ATTRS (coding.id);
6160   eol_type = CODING_ID_EOL_TYPE (coding.id);
6161
6162   coding.source = src;
6163   coding.src_bytes = src_bytes;
6164   coding.src_multibyte = multibytep;
6165   coding.consumed = 0;
6166
6167   if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided)
6168     {
6169       mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6170     }
6171   else
6172     {
6173       coding_system = Qnil;
6174       for (; src < src_end; src++)
6175         {
6176           c = *src;
6177           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
6178                                         || c == ISO_CODE_SI
6179                                         || c == ISO_CODE_SO)))
6180             break;
6181         }
6182       coding.head_ascii = src - coding.source;
6183
6184       if (src < src_end)
6185         for (i = 0; i < coding_category_raw_text; i++)
6186           {
6187             enum coding_category category = coding_priorities[i];
6188             struct coding_system *this = coding_categories + category;
6189
6190             if (category >= coding_category_raw_text
6191                 || detected & (1 << category))
6192               continue;
6193
6194             if (this->id < 0)
6195               {
6196                 /* No coding system of this category is defined.  */
6197                 mask &= ~(1 << category);
6198               }
6199             else
6200               {
6201                 detected |= detected_mask[category];
6202                 if ((*(coding_categories[category].detector)) (&coding, &mask)
6203                     && highest)
6204                   {
6205                     mask &= detected_mask[category];
6206                     break;
6207                   }
6208               }
6209           }
6210     }
6211
6212   if (!mask)
6213     val = Fcons (make_number (coding_category_raw_text), Qnil);
6214   else if (mask == CATEGORY_MASK_ANY)
6215     val = Fcons (make_number (coding_category_undecided), Qnil);
6216   else if (highest)
6217     {
6218       for (i = 0; i < coding_category_raw_text; i++)
6219         if (mask & (1 << coding_priorities[i]))
6220           {
6221             val = Fcons (make_number (coding_priorities[i]), Qnil);
6222             break;
6223           }
6224     }
6225   else
6226     {
6227       val = Qnil;
6228       for (i = coding_category_raw_text - 1; i >= 0; i--)
6229         if (mask & (1 << coding_priorities[i]))
6230           val = Fcons (make_number (coding_priorities[i]), val);
6231     }
6232
6233   {
6234     int one_byte_eol = -1, two_byte_eol = -1;
6235     Lisp_Object tail;
6236
6237     for (tail = val; CONSP (tail); tail = XCDR (tail))
6238       {
6239         struct coding_system *this
6240           = (NILP (coding_system) ? coding_categories + XINT (XCAR (tail))
6241              : &coding);
6242         int this_eol;
6243
6244         attrs = CODING_ID_ATTRS (this->id);
6245         eol_type = CODING_ID_EOL_TYPE (this->id);
6246         XSETCAR (tail, CODING_ID_NAME (this->id));
6247         if (VECTORP (eol_type))
6248           {
6249             if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16))
6250               {
6251                 if (two_byte_eol < 0)
6252                   two_byte_eol = detect_eol (this, coding.source, src_bytes);
6253                 this_eol = two_byte_eol;
6254               }
6255             else
6256               {
6257                 if (one_byte_eol < 0)
6258                   one_byte_eol =detect_eol (this, coding.source, src_bytes);
6259                 this_eol = one_byte_eol;
6260               }
6261             if (this_eol == EOL_SEEN_LF)
6262               XSETCAR (tail, AREF (eol_type, 0));
6263             else if (this_eol == EOL_SEEN_CRLF)
6264               XSETCAR (tail, AREF (eol_type, 1));
6265             else if (this_eol == EOL_SEEN_CR)
6266               XSETCAR (tail, AREF (eol_type, 2));
6267           }
6268       }
6269   }
6270
6271   return (highest ? XCAR (val) : val);
6272 }
6273
6274
6275 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6276        2, 3, 0,
6277        doc: /* Detect coding system of the text in the region between START and END.
6278 Return a list of possible coding systems ordered by priority.
6279
6280 If only ASCII characters are found, it returns a list of single element
6281 `undecided' or its subsidiary coding system according to a detected
6282 end-of-line format.
6283
6284 If optional argument HIGHEST is non-nil, return the coding system of
6285 highest priority.  */)
6286      (start, end, highest)
6287      Lisp_Object start, end, highest;
6288 {
6289   int from, to;
6290   int from_byte, to_byte;
6291
6292   CHECK_NUMBER_COERCE_MARKER (start);
6293   CHECK_NUMBER_COERCE_MARKER (end);
6294
6295   validate_region (&start, &end);
6296   from = XINT (start), to = XINT (end);
6297   from_byte = CHAR_TO_BYTE (from);
6298   to_byte = CHAR_TO_BYTE (to);
6299
6300   if (from < GPT && to >= GPT)
6301     move_gap_both (to, to_byte);
6302
6303   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6304                                to_byte - from_byte,
6305                                !NILP (highest),
6306                                !NILP (current_buffer
6307                                       ->enable_multibyte_characters),
6308                                Qnil);
6309 }
6310
6311 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6312        1, 2, 0,
6313        doc: /* Detect coding system of the text in STRING.
6314 Return a list of possible coding systems ordered by priority.
6315
6316 If only ASCII characters are found, it returns a list of single element
6317 `undecided' or its subsidiary coding system according to a detected
6318 end-of-line format.
6319
6320 If optional argument HIGHEST is non-nil, return the coding system of
6321 highest priority.  */)
6322      (string, highest)
6323      Lisp_Object string, highest;
6324 {
6325   CHECK_STRING (string);
6326
6327   return detect_coding_system (XSTRING (string)->data,
6328                                STRING_BYTES (XSTRING (string)),
6329                                !NILP (highest),
6330                                STRING_MULTIBYTE (string),
6331                                Qnil);
6332 }
6333
6334
6335 static INLINE int
6336 char_encodable_p (c, attrs)
6337      int c;
6338      Lisp_Object attrs;
6339 {
6340   Lisp_Object tail;
6341   struct charset *charset;
6342
6343   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6344        CONSP (tail); tail = XCDR (tail))
6345     {
6346       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6347       if (CHAR_CHARSET_P (c, charset))
6348         break;
6349     }
6350   return (! NILP (tail));
6351 }
6352
6353
6354 /* Return a list of coding systems that safely encode the text between
6355    START and END.  If EXCLUDE is non-nil, it is a list of coding
6356    systems not to check.  The returned list doesn't contain any such
6357    coding systems.  In any case, If the text contains only ASCII or is
6358    unibyte, return t.  */
6359
6360 DEFUN ("find-coding-systems-region-internal",
6361        Ffind_coding_systems_region_internal,
6362        Sfind_coding_systems_region_internal, 2, 3, 0,
6363        doc: /* Internal use only.  */)
6364      (start, end, exclude)
6365      Lisp_Object start, end, exclude;
6366 {
6367   Lisp_Object coding_attrs_list, safe_codings;
6368   EMACS_INT start_byte, end_byte;
6369   unsigned char *p, *pbeg, *pend;
6370   int c;
6371   Lisp_Object tail, elt;
6372
6373   if (STRINGP (start))
6374     {
6375       if (!STRING_MULTIBYTE (start)
6376           && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6377         return Qt;
6378       start_byte = 0;
6379       end_byte = STRING_BYTES (XSTRING (start));
6380     }
6381   else
6382     {
6383       CHECK_NUMBER_COERCE_MARKER (start);
6384       CHECK_NUMBER_COERCE_MARKER (end);
6385       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6386         args_out_of_range (start, end);
6387       if (NILP (current_buffer->enable_multibyte_characters))
6388         return Qt;
6389       start_byte = CHAR_TO_BYTE (XINT (start));
6390       end_byte = CHAR_TO_BYTE (XINT (end));
6391       if (XINT (end) - XINT (start) == end_byte - start_byte)
6392         return Qt;
6393
6394       if (start < GPT && end > GPT)
6395         {
6396           if ((GPT - start) < (end - GPT))
6397             move_gap_both (start, start_byte);
6398           else
6399             move_gap_both (end, end_byte);
6400         }
6401     }
6402
6403   coding_attrs_list = Qnil;
6404   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
6405     if (NILP (exclude)
6406         || NILP (Fmemq (XCAR (tail), exclude)))
6407       {
6408         Lisp_Object attrs;
6409
6410         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
6411         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
6412             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6413           coding_attrs_list = Fcons (attrs, coding_attrs_list);
6414       }
6415
6416   if (STRINGP (start))
6417     p = pbeg = XSTRING (start)->data;
6418   else
6419     p = pbeg = BYTE_POS_ADDR (start_byte);
6420   pend = p + (end_byte - start_byte);
6421
6422   while (p < pend && ASCII_BYTE_P (*p)) p++;
6423   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6424
6425   while (p < pend)
6426     {
6427       if (ASCII_BYTE_P (*p))
6428         p++;
6429       else
6430         {
6431           c = STRING_CHAR_ADVANCE (p);
6432
6433           charset_map_loaded = 0;
6434           for (tail = coding_attrs_list; CONSP (tail);)
6435             {
6436               elt = XCAR (tail);
6437               if (NILP (elt))
6438                 tail = XCDR (tail);
6439               else if (char_encodable_p (c, elt))
6440                 tail = XCDR (tail);
6441               else if (CONSP (XCDR (tail)))
6442                 {
6443                   XSETCAR (tail, XCAR (XCDR (tail)));
6444                   XSETCDR (tail, XCDR (XCDR (tail)));
6445                 }
6446               else
6447                 {
6448                   XSETCAR (tail, Qnil);
6449                   tail = XCDR (tail);
6450                 }
6451             }
6452           if (charset_map_loaded)
6453             {
6454               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6455
6456               if (STRINGP (start))
6457                 pbeg = XSTRING (start)->data;
6458               else
6459                 pbeg = BYTE_POS_ADDR (start_byte);
6460               p = pbeg + p_offset;
6461               pend = pbeg + pend_offset;
6462             }
6463         }
6464     }
6465
6466   safe_codings = Qnil;
6467   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
6468     if (! NILP (XCAR (tail)))
6469       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
6470
6471   return safe_codings;
6472 }
6473
6474
6475 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
6476        Scheck_coding_systems_region, 3, 3, 0,
6477        doc: /* Check if the region is encodable by coding systems.
6478
6479 START and END are buffer positions specifying the region.
6480 CODING-SYSTEM-LIST is a list of coding systems to check.
6481
6482 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6483 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6484 whole region, POS0, POS1, ... are buffer positions where non-encodable
6485 characters are found.
6486
6487 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6488 value is nil.
6489
6490 START may be a string.  In that case, check if the string is
6491 encodable, and the value contains indices to the string instead of
6492 buffer positions.  END is ignored.  */)
6493      (start, end, coding_system_list)
6494      Lisp_Object start, end, coding_system_list;
6495 {
6496   Lisp_Object list;
6497   EMACS_INT start_byte, end_byte;
6498   int pos;
6499   unsigned char *p, *pbeg, *pend;
6500   int c;
6501   Lisp_Object tail, elt;
6502
6503   if (STRINGP (start))
6504     {
6505       if (!STRING_MULTIBYTE (start)
6506           && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6507         return Qnil;
6508       start_byte = 0;
6509       end_byte = STRING_BYTES (XSTRING (start));
6510       pos = 0;
6511     }
6512   else
6513     {
6514       CHECK_NUMBER_COERCE_MARKER (start);
6515       CHECK_NUMBER_COERCE_MARKER (end);
6516       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6517         args_out_of_range (start, end);
6518       if (NILP (current_buffer->enable_multibyte_characters))
6519         return Qnil;
6520       start_byte = CHAR_TO_BYTE (XINT (start));
6521       end_byte = CHAR_TO_BYTE (XINT (end));
6522       if (XINT (end) - XINT (start) == end_byte - start_byte)
6523         return Qt;
6524
6525       if (start < GPT && end > GPT)
6526         {
6527           if ((GPT - start) < (end - GPT))
6528             move_gap_both (start, start_byte);
6529           else
6530             move_gap_both (end, end_byte);
6531         }
6532       pos = start;
6533     }
6534
6535   list = Qnil;
6536   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
6537     {
6538       elt = XCAR (tail);
6539       list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
6540                                        Qnil)),
6541                     list);
6542     }
6543
6544   if (STRINGP (start))
6545     p = pbeg = XSTRING (start)->data;
6546   else
6547     p = pbeg = BYTE_POS_ADDR (start_byte);
6548   pend = p + (end_byte - start_byte);
6549
6550   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
6551   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6552
6553   while (p < pend)
6554     {
6555       if (ASCII_BYTE_P (*p))
6556         p++;
6557       else
6558         {
6559           c = STRING_CHAR_ADVANCE (p);
6560
6561           charset_map_loaded = 0;
6562           for (tail = list; CONSP (tail); tail = XCDR (tail))
6563             {
6564               elt = XCDR (XCAR (tail));
6565               if (! char_encodable_p (c, XCAR (elt)))
6566                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
6567             }
6568           if (charset_map_loaded)
6569             {
6570               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6571
6572               if (STRINGP (start))
6573                 pbeg = XSTRING (start)->data;
6574               else
6575                 pbeg = BYTE_POS_ADDR (start_byte);
6576               p = pbeg + p_offset;
6577               pend = pbeg + pend_offset;
6578             }
6579         }
6580       pos++;
6581     }
6582
6583   tail = list;
6584   list = Qnil;
6585   for (; CONSP (tail); tail = XCDR (tail))
6586     {
6587       elt = XCAR (tail);
6588       if (CONSP (XCDR (XCDR (elt))))
6589         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
6590                       list);
6591     }
6592
6593   return list;
6594 }
6595
6596
6597
6598 Lisp_Object
6599 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
6600      Lisp_Object start, end, coding_system, dst_object;
6601      int encodep, norecord;
6602 {
6603   struct coding_system coding;
6604   EMACS_INT from, from_byte, to, to_byte;
6605   Lisp_Object src_object;
6606
6607   CHECK_NUMBER_COERCE_MARKER (start);
6608   CHECK_NUMBER_COERCE_MARKER (end);
6609   if (NILP (coding_system))
6610     coding_system = Qno_conversion;
6611   else
6612     CHECK_CODING_SYSTEM (coding_system);
6613   src_object = Fcurrent_buffer ();
6614   if (NILP (dst_object))
6615     dst_object = src_object;
6616   else if (! EQ (dst_object, Qt))
6617     CHECK_BUFFER (dst_object);
6618
6619   validate_region (&start, &end);
6620   from = XFASTINT (start);
6621   from_byte = CHAR_TO_BYTE (from);
6622   to = XFASTINT (end);
6623   to_byte = CHAR_TO_BYTE (to);
6624
6625   setup_coding_system (coding_system, &coding);
6626   coding.mode |= CODING_MODE_LAST_BLOCK;
6627
6628   if (encodep)
6629     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6630                           dst_object);
6631   else
6632     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6633                           dst_object);
6634   if (! norecord)
6635     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6636
6637   if (coding.result != CODING_RESULT_SUCCESS)
6638     error ("Code conversion error: %d", coding.result);
6639
6640   return (BUFFERP (dst_object)
6641           ? make_number (coding.produced_char)
6642           : coding.dst_object);
6643 }
6644
6645
6646 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6647        3, 4, "r\nzCoding system: ",
6648        doc: /* Decode the current region from the specified coding system.
6649 When called from a program, takes four arguments:
6650         START, END, CODING-SYSTEM, and DESTINATION.
6651 START and END are buffer positions.
6652
6653 Optional 4th arguments DESTINATION specifies where the decoded text goes.
6654 If nil, the region between START and END is replace by the decoded text.
6655 If buffer, the decoded text is inserted in the buffer.
6656 If t, the decoded text is returned.
6657
6658 This function sets `last-coding-system-used' to the precise coding system
6659 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6660 not fully specified.)
6661 It returns the length of the decoded text.  */)
6662      (start, end, coding_system, destination)
6663      Lisp_Object start, end, coding_system, destination;
6664 {
6665   return code_convert_region (start, end, coding_system, destination, 0, 0);
6666 }
6667
6668 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6669        3, 4, "r\nzCoding system: ",
6670        doc: /* Encode the current region by specified coding system.
6671 When called from a program, takes three arguments:
6672 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6673
6674 Optional 4th arguments DESTINATION specifies where the encoded text goes.
6675 If nil, the region between START and END is replace by the encoded text.
6676 If buffer, the encoded text is inserted in the buffer.
6677 If t, the encoded text is returned.
6678
6679 This function sets `last-coding-system-used' to the precise coding system
6680 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6681 not fully specified.)
6682 It returns the length of the encoded text.  */)
6683   (start, end, coding_system, destination)
6684      Lisp_Object start, end, coding_system, destination;
6685 {
6686   return code_convert_region (start, end, coding_system, destination, 1, 0);
6687 }
6688
6689 Lisp_Object
6690 code_convert_string (string, coding_system, dst_object,
6691                      encodep, nocopy, norecord)
6692      Lisp_Object string, coding_system, dst_object;
6693      int encodep, nocopy, norecord;
6694 {
6695   struct coding_system coding;
6696   EMACS_INT chars, bytes;
6697
6698   CHECK_STRING (string);
6699   if (NILP (coding_system))
6700     {
6701       if (! norecord)
6702         Vlast_coding_system_used = Qno_conversion;
6703       if (NILP (dst_object))
6704         return (nocopy ? Fcopy_sequence (string) : string);
6705     }
6706
6707   if (NILP (coding_system))
6708     coding_system = Qno_conversion;
6709   else
6710     CHECK_CODING_SYSTEM (coding_system);
6711   if (NILP (dst_object))
6712     dst_object = Qt;
6713   else if (! EQ (dst_object, Qt))
6714     CHECK_BUFFER (dst_object);
6715
6716   setup_coding_system (coding_system, &coding);
6717   coding.mode |= CODING_MODE_LAST_BLOCK;
6718   chars = XSTRING (string)->size;
6719   bytes = STRING_BYTES (XSTRING (string));
6720   if (encodep)
6721     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6722   else
6723     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6724   if (! norecord)
6725     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6726
6727   if (coding.result != CODING_RESULT_SUCCESS)
6728     error ("Code conversion error: %d", coding.result);
6729
6730   return (BUFFERP (dst_object)
6731           ? make_number (coding.produced_char)
6732           : coding.dst_object);
6733 }
6734
6735
6736 /* Encode or decode STRING according to CODING_SYSTEM.
6737    Do not set Vlast_coding_system_used.
6738
6739    This function is called only from macros DECODE_FILE and
6740    ENCODE_FILE, thus we ignore character composition.  */
6741
6742 Lisp_Object
6743 code_convert_string_norecord (string, coding_system, encodep)
6744      Lisp_Object string, coding_system;
6745      int encodep;
6746 {
6747   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
6748 }
6749
6750
6751 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6752        2, 4, 0,
6753        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6754
6755 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6756 if the decoding operation is trivial.
6757
6758 Optional fourth arg BUFFER non-nil meant that the decoded text is
6759 inserted in BUFFER instead of returned as a astring.  In this case,
6760 the return value is BUFFER.
6761
6762 This function sets `last-coding-system-used' to the precise coding system
6763 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6764 not fully specified.  */)
6765   (string, coding_system, nocopy, buffer)
6766      Lisp_Object string, coding_system, nocopy, buffer;
6767 {
6768   return code_convert_string (string, coding_system, buffer,
6769                               0, ! NILP (nocopy), 0);
6770 }
6771
6772 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6773        2, 4, 0,
6774        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6775
6776 Optional third arg NOCOPY non-nil means it is OK to return STRING
6777 itself if the encoding operation is trivial.
6778
6779 Optional fourth arg BUFFER non-nil meant that the encoded text is
6780 inserted in BUFFER instead of returned as a astring.  In this case,
6781 the return value is BUFFER.
6782
6783 This function sets `last-coding-system-used' to the precise coding system
6784 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6785 not fully specified.)  */)
6786      (string, coding_system, nocopy, buffer)
6787      Lisp_Object string, coding_system, nocopy, buffer;
6788 {
6789   return code_convert_string (string, coding_system, buffer,
6790                               nocopy, ! NILP (nocopy), 1);
6791 }
6792
6793 \f
6794 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6795        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6796 Return the corresponding character.  */)
6797      (code)
6798      Lisp_Object code;
6799 {
6800   Lisp_Object spec, attrs, val;
6801   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
6802   int c;
6803
6804   CHECK_NATNUM (code);
6805   c = XFASTINT (code);
6806   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6807   attrs = AREF (spec, 0);
6808
6809   if (ASCII_BYTE_P (c)
6810       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6811     return code;
6812
6813   val = CODING_ATTR_CHARSET_LIST (attrs);
6814   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6815   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6816   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
6817
6818   if (c <= 0x7F)
6819     charset = charset_roman;
6820   else if (c >= 0xA0 && c < 0xDF)
6821     {
6822       charset = charset_kana;
6823       c -= 0x80;
6824     }
6825   else
6826     {
6827       int s1 = c >> 8, s2 = c & 0x7F;
6828
6829       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
6830           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
6831         error ("Invalid code: %d", code);
6832       SJIS_TO_JIS (c);
6833       charset = charset_kanji;
6834     }
6835   c = DECODE_CHAR (charset, c);
6836   if (c < 0)
6837     error ("Invalid code: %d", code);
6838   return make_number (c);
6839 }
6840
6841
6842 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6843        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6844 Return the corresponding code in SJIS.  */)
6845      (ch)
6846     Lisp_Object ch;
6847 {
6848   Lisp_Object spec, attrs, charset_list;
6849   int c;
6850   struct charset *charset;
6851   unsigned code;
6852
6853   CHECK_CHARACTER (ch);
6854   c = XFASTINT (ch);
6855   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6856   attrs = AREF (spec, 0);
6857
6858   if (ASCII_CHAR_P (c)
6859       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6860     return ch;
6861
6862   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6863   charset = char_charset (c, charset_list, &code);
6864   if (code == CHARSET_INVALID_CODE (charset))
6865     error ("Can't encode by shift_jis encoding: %d", c);
6866   JIS_TO_SJIS (code);
6867
6868   return make_number (code);
6869 }
6870
6871 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
6872        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
6873 Return the corresponding character.  */)
6874      (code)
6875      Lisp_Object code;
6876 {
6877   Lisp_Object spec, attrs, val;
6878   struct charset *charset_roman, *charset_big5, *charset;
6879   int c;
6880
6881   CHECK_NATNUM (code);
6882   c = XFASTINT (code);
6883   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
6884   attrs = AREF (spec, 0);
6885
6886   if (ASCII_BYTE_P (c)
6887       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6888     return code;
6889
6890   val = CODING_ATTR_CHARSET_LIST (attrs);
6891   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6892   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
6893
6894   if (c <= 0x7F)
6895     charset = charset_roman;
6896   else
6897     {
6898       int b1 = c >> 8, b2 = c & 0x7F;
6899       if (b1 < 0xA1 || b1 > 0xFE
6900           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
6901         error ("Invalid code: %d", code);
6902       charset = charset_big5;
6903     }
6904   c = DECODE_CHAR (charset, (unsigned )c);
6905   if (c < 0)
6906     error ("Invalid code: %d", code);
6907   return make_number (c);
6908 }
6909
6910 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
6911        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
6912 Return the corresponding character code in Big5.  */)
6913      (ch)
6914      Lisp_Object ch;
6915 {
6916   Lisp_Object spec, attrs, charset_list;
6917   struct charset *charset;
6918   int c;
6919   unsigned code;
6920
6921   CHECK_CHARACTER (ch);
6922   c = XFASTINT (ch);
6923   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
6924   attrs = AREF (spec, 0);
6925   if (ASCII_CHAR_P (c)
6926       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6927     return ch;
6928
6929   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
6930   charset = char_charset (c, charset_list, &code);
6931   if (code == CHARSET_INVALID_CODE (charset))
6932     error ("Can't encode by Big5 encoding: %d", c);
6933
6934   return make_number (code);
6935 }
6936
6937 \f
6938 DEFUN ("set-terminal-coding-system-internal",
6939        Fset_terminal_coding_system_internal,
6940        Sset_terminal_coding_system_internal, 1, 1, 0,
6941        doc: /* Internal use only.  */)
6942      (coding_system)
6943 {
6944   CHECK_SYMBOL (coding_system);
6945   setup_coding_system (Fcheck_coding_system (coding_system),
6946                         &terminal_coding);
6947
6948   /* We had better not send unsafe characters to terminal.  */
6949   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
6950   /* Characer composition should be disabled.  */
6951   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6952   terminal_coding.src_multibyte = 1;
6953   terminal_coding.dst_multibyte = 0;
6954   return Qnil;
6955 }
6956
6957 DEFUN ("set-safe-terminal-coding-system-internal",
6958        Fset_safe_terminal_coding_system_internal,
6959        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
6960        doc: /* Internal use only.  */)
6961      (coding_system)
6962 {
6963   CHECK_SYMBOL (coding_system);
6964   setup_coding_system (Fcheck_coding_system (coding_system),
6965                        &safe_terminal_coding);
6966   /* Characer composition should be disabled.  */
6967   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6968   safe_terminal_coding.src_multibyte = 1;
6969   safe_terminal_coding.dst_multibyte = 0;
6970   return Qnil;
6971 }
6972
6973 DEFUN ("terminal-coding-system",
6974        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
6975        doc: /* Return coding system specified for terminal output.  */)
6976      ()
6977 {
6978   return CODING_ID_NAME (terminal_coding.id);
6979 }
6980
6981 DEFUN ("set-keyboard-coding-system-internal",
6982        Fset_keyboard_coding_system_internal,
6983        Sset_keyboard_coding_system_internal, 1, 1, 0,
6984        doc: /* Internal use only.  */)
6985      (coding_system)
6986      Lisp_Object coding_system;
6987 {
6988   CHECK_SYMBOL (coding_system);
6989   setup_coding_system (Fcheck_coding_system (coding_system),
6990                        &keyboard_coding);
6991   /* Characer composition should be disabled.  */
6992   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6993   return Qnil;
6994 }
6995
6996 DEFUN ("keyboard-coding-system",
6997        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
6998        doc: /* Return coding system specified for decoding keyboard input.  */)
6999      ()
7000 {
7001   return CODING_ID_NAME (keyboard_coding.id);
7002 }
7003
7004 \f
7005 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7006        Sfind_operation_coding_system,  1, MANY, 0,
7007        doc: /* Choose a coding system for an operation based on the target name.
7008 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7009 DECODING-SYSTEM is the coding system to use for decoding
7010 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7011 for encoding (in case OPERATION does encoding).
7012
7013 The first argument OPERATION specifies an I/O primitive:
7014   For file I/O, `insert-file-contents' or `write-region'.
7015   For process I/O, `call-process', `call-process-region', or `start-process'.
7016   For network I/O, `open-network-stream'.
7017
7018 The remaining arguments should be the same arguments that were passed
7019 to the primitive.  Depending on which primitive, one of those arguments
7020 is selected as the TARGET.  For example, if OPERATION does file I/O,
7021 whichever argument specifies the file name is TARGET.
7022
7023 TARGET has a meaning which depends on OPERATION:
7024   For file I/O, TARGET is a file name.
7025   For process I/O, TARGET is a process name.
7026   For network I/O, TARGET is a service name or a port number
7027
7028 This function looks up what specified for TARGET in,
7029 `file-coding-system-alist', `process-coding-system-alist',
7030 or `network-coding-system-alist' depending on OPERATION.
7031 They may specify a coding system, a cons of coding systems,
7032 or a function symbol to call.
7033 In the last case, we call the function with one argument,
7034 which is a list of all the arguments given to this function.
7035
7036 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7037      (nargs, args)
7038      int nargs;
7039      Lisp_Object *args;
7040 {
7041   Lisp_Object operation, target_idx, target, val;
7042   register Lisp_Object chain;
7043
7044   if (nargs < 2)
7045     error ("Too few arguments");
7046   operation = args[0];
7047   if (!SYMBOLP (operation)
7048       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7049     error ("Invalid first arguement");
7050   if (nargs < 1 + XINT (target_idx))
7051     error ("Too few arguments for operation: %s",
7052            XSYMBOL (operation)->name->data);
7053   target = args[XINT (target_idx) + 1];
7054   if (!(STRINGP (target)
7055         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7056     error ("Invalid %dth argument", XINT (target_idx) + 1);
7057
7058   chain = ((EQ (operation, Qinsert_file_contents)
7059             || EQ (operation, Qwrite_region))
7060            ? Vfile_coding_system_alist
7061            : (EQ (operation, Qopen_network_stream)
7062               ? Vnetwork_coding_system_alist
7063               : Vprocess_coding_system_alist));
7064   if (NILP (chain))
7065     return Qnil;
7066
7067   for (; CONSP (chain); chain = XCDR (chain))
7068     {
7069       Lisp_Object elt;
7070
7071       elt = XCAR (chain);
7072       if (CONSP (elt)
7073           && ((STRINGP (target)
7074                && STRINGP (XCAR (elt))
7075                && fast_string_match (XCAR (elt), target) >= 0)
7076               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7077         {
7078           val = XCDR (elt);
7079           /* Here, if VAL is both a valid coding system and a valid
7080              function symbol, we return VAL as a coding system.  */
7081           if (CONSP (val))
7082             return val;
7083           if (! SYMBOLP (val))
7084             return Qnil;
7085           if (! NILP (Fcoding_system_p (val)))
7086             return Fcons (val, val);
7087           if (! NILP (Ffboundp (val)))
7088             {
7089               val = call1 (val, Flist (nargs, args));
7090               if (CONSP (val))
7091                 return val;
7092               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7093                 return Fcons (val, val);
7094             }
7095           return Qnil;
7096         }
7097     }
7098   return Qnil;
7099 }
7100
7101 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7102        Sset_coding_system_priority, 1, MANY, 0,
7103        doc: /* Put higher priority to coding systems of the arguments.  */)
7104      (nargs, args)
7105      int nargs;
7106      Lisp_Object *args;
7107 {
7108   int i, j;
7109   int changed[coding_category_max];
7110   enum coding_category priorities[coding_category_max];
7111
7112   bzero (changed, sizeof changed);
7113
7114   for (i = j = 0; i < nargs; i++)
7115     {
7116       enum coding_category category;
7117       Lisp_Object spec, attrs;
7118
7119       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7120       attrs = AREF (spec, 0);
7121       category = XINT (CODING_ATTR_CATEGORY (attrs));
7122       if (changed[category])
7123         /* Ignore this coding system because a coding system of the
7124            same category already had a higher priority.  */
7125         continue;
7126       changed[category] = 1;
7127       priorities[j++] = category;
7128       if (coding_categories[category].id >= 0
7129           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7130         setup_coding_system (args[i], &coding_categories[category]);
7131     }
7132
7133   /* Now we have decided top J priorities.  Reflect the order of the
7134      original priorities to the remaining priorities.  */
7135
7136   for (i = j, j = 0; i < coding_category_max; i++, j++)
7137     {
7138       while (j < coding_category_max
7139              && changed[coding_priorities[j]])
7140         j++;
7141       if (j == coding_category_max)
7142         abort ();
7143       priorities[i] = coding_priorities[j];
7144     }
7145
7146   bcopy (priorities, coding_priorities, sizeof priorities);
7147   return Qnil;
7148 }
7149
7150 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7151        Scoding_system_priority_list, 0, 1, 0,
7152        doc: /* Return a list of coding systems ordered by their priorities.  */)
7153      (highestp)
7154      Lisp_Object highestp;
7155 {
7156   int i;
7157   Lisp_Object val;
7158
7159   for (i = 0, val = Qnil; i < coding_category_max; i++)
7160     {
7161       enum coding_category category = coding_priorities[i];
7162       int id = coding_categories[category].id;
7163       Lisp_Object attrs;
7164
7165       if (id < 0)
7166         continue;
7167       attrs = CODING_ID_ATTRS (id);
7168       if (! NILP (highestp))
7169         return CODING_ATTR_BASE_NAME (attrs);
7170       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7171     }
7172   return Fnreverse (val);
7173 }
7174
7175 static Lisp_Object
7176 make_subsidiaries (base)
7177      Lisp_Object base;
7178 {
7179   Lisp_Object subsidiaries;
7180   char *suffixes[] = { "-unix", "-dos", "-mac" };
7181   int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7182   char *buf = (char *) alloca (base_name_len + 6);
7183   int i;
7184
7185   bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7186   subsidiaries = Fmake_vector (make_number (3), Qnil);
7187   for (i = 0; i < 3; i++)
7188     {
7189       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7190       ASET (subsidiaries, i, intern (buf));
7191     }
7192   return subsidiaries;
7193 }
7194
7195
7196 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7197        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7198        doc: /* For internal use only.  */)
7199      (nargs, args)
7200      int nargs;
7201      Lisp_Object *args;
7202 {
7203   Lisp_Object name;
7204   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
7205   Lisp_Object attrs;            /* Vector of attributes.  */
7206   Lisp_Object eol_type;
7207   Lisp_Object aliases;
7208   Lisp_Object coding_type, charset_list, safe_charsets;
7209   enum coding_category category;
7210   Lisp_Object tail, val;
7211   int max_charset_id = 0;
7212   int i;
7213
7214   if (nargs < coding_arg_max)
7215     goto short_args;
7216
7217   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7218
7219   name = args[coding_arg_name];
7220   CHECK_SYMBOL (name);
7221   CODING_ATTR_BASE_NAME (attrs) = name;
7222
7223   val = args[coding_arg_mnemonic];
7224   if (! STRINGP (val))
7225     CHECK_CHARACTER (val);
7226   CODING_ATTR_MNEMONIC (attrs) = val;
7227
7228   coding_type = args[coding_arg_coding_type];
7229   CHECK_SYMBOL (coding_type);
7230   CODING_ATTR_TYPE (attrs) = coding_type;
7231
7232   charset_list = args[coding_arg_charset_list];
7233   if (SYMBOLP (charset_list))
7234     {
7235       if (EQ (charset_list, Qiso_2022))
7236         {
7237           if (! EQ (coding_type, Qiso_2022))
7238             error ("Invalid charset-list");
7239           charset_list = Viso_2022_charset_list;
7240         }
7241       else if (EQ (charset_list, Qemacs_mule))
7242         {
7243           if (! EQ (coding_type, Qemacs_mule))
7244             error ("Invalid charset-list");
7245           charset_list = Vemacs_mule_charset_list;
7246         }
7247       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7248         if (max_charset_id < XFASTINT (XCAR (tail)))
7249           max_charset_id = XFASTINT (XCAR (tail));
7250     }
7251   else
7252     {
7253       charset_list = Fcopy_sequence (charset_list);
7254       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7255         {
7256           struct charset *charset;
7257
7258           val = Fcar (tail);
7259           CHECK_CHARSET_GET_CHARSET (val, charset);
7260           if (EQ (coding_type, Qiso_2022)
7261               ? CHARSET_ISO_FINAL (charset) < 0
7262               : EQ (coding_type, Qemacs_mule)
7263               ? CHARSET_EMACS_MULE_ID (charset) < 0
7264               : 0)
7265             error ("Can't handle charset `%s'",
7266                    XSYMBOL (CHARSET_NAME (charset))->name->data);
7267
7268           XCAR (tail) = make_number (charset->id);
7269           if (max_charset_id < charset->id)
7270             max_charset_id = charset->id;
7271         }
7272     }
7273   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7274
7275   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7276                                 make_number (255));
7277   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7278     XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7279   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7280
7281   val = args[coding_arg_decode_translation_table];
7282   if (! NILP (val))
7283     CHECK_CHAR_TABLE (val);
7284   CODING_ATTR_DECODE_TBL (attrs) = val;
7285
7286   val = args[coding_arg_encode_translation_table];
7287   if (! NILP (val))
7288     CHECK_CHAR_TABLE (val);
7289   CODING_ATTR_ENCODE_TBL (attrs) = val;
7290
7291   val = args[coding_arg_post_read_conversion];
7292   CHECK_SYMBOL (val);
7293   CODING_ATTR_POST_READ (attrs) = val;
7294
7295   val = args[coding_arg_pre_write_conversion];
7296   CHECK_SYMBOL (val);
7297   CODING_ATTR_PRE_WRITE (attrs) = val;
7298
7299   val = args[coding_arg_default_char];
7300   if (NILP (val))
7301     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7302   else
7303     {
7304       CHECK_CHARACTER (val);
7305       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7306     }
7307
7308   val = args[coding_arg_plist];
7309   CHECK_LIST (val);
7310   CODING_ATTR_PLIST (attrs) = val;
7311
7312   if (EQ (coding_type, Qcharset))
7313     {
7314       val = Fmake_vector (make_number (256), Qnil);
7315
7316       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7317         {
7318           struct charset *charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7319
7320           for (i = charset->code_space[0]; i <= charset->code_space[1]; i++)
7321             if (NILP (AREF (val, i)))
7322               ASET (val, i, XCAR (tail));
7323         }
7324       ASET (attrs, coding_attr_charset_valids, val);
7325       category = coding_category_charset;
7326     }
7327   else if (EQ (coding_type, Qccl))
7328     {
7329       Lisp_Object valids;
7330
7331       if (nargs < coding_arg_ccl_max)
7332         goto short_args;
7333
7334       val = args[coding_arg_ccl_decoder];
7335       CHECK_CCL_PROGRAM (val);
7336       if (VECTORP (val))
7337         val = Fcopy_sequence (val);
7338       ASET (attrs, coding_attr_ccl_decoder, val);
7339
7340       val = args[coding_arg_ccl_encoder];
7341       CHECK_CCL_PROGRAM (val);
7342       if (VECTORP (val))
7343         val = Fcopy_sequence (val);
7344       ASET (attrs, coding_attr_ccl_encoder, val);
7345
7346       val = args[coding_arg_ccl_valids];
7347       valids = Fmake_string (make_number (256), make_number (0));
7348       for (tail = val; !NILP (tail); tail = Fcdr (tail))
7349         {
7350           val = Fcar (tail);
7351           if (INTEGERP (val))
7352             ASET (valids, XINT (val), 1);
7353           else
7354             {
7355               int from, to;
7356
7357               CHECK_CONS (val);
7358               CHECK_NUMBER (XCAR (val));
7359               CHECK_NUMBER (XCDR (val));
7360               from = XINT (XCAR (val));
7361               to = XINT (XCDR (val));
7362               for (i = from; i <= to; i++)
7363                 ASET (valids, i, 1);
7364             }
7365         }
7366       ASET (attrs, coding_attr_ccl_valids, valids);
7367
7368       category = coding_category_ccl;
7369     }
7370   else if (EQ (coding_type, Qutf_16))
7371     {
7372       Lisp_Object bom, endian;
7373
7374       if (nargs < coding_arg_utf16_max)
7375         goto short_args;
7376
7377       bom = args[coding_arg_utf16_bom];
7378       if (! NILP (bom) && ! EQ (bom, Qt))
7379         {
7380           CHECK_CONS (bom);
7381           CHECK_CODING_SYSTEM (XCAR (bom));
7382           CHECK_CODING_SYSTEM (XCDR (bom));
7383         }
7384       ASET (attrs, coding_attr_utf_16_bom, bom);
7385
7386       endian = args[coding_arg_utf16_endian];
7387       ASET (attrs, coding_attr_utf_16_endian, endian);
7388
7389       category = (CONSP (bom)
7390                   ? coding_category_utf_16_auto
7391                   : NILP (bom)
7392                   ? (NILP (endian)
7393                      ? coding_category_utf_16_be_nosig
7394                      : coding_category_utf_16_le_nosig)
7395                   : (NILP (endian)
7396                      ? coding_category_utf_16_be
7397                      : coding_category_utf_16_le));
7398     }
7399   else if (EQ (coding_type, Qiso_2022))
7400     {
7401       Lisp_Object initial, reg_usage, request, flags;
7402       struct charset *charset;
7403       int i, id;
7404
7405       if (nargs < coding_arg_iso2022_max)
7406         goto short_args;
7407
7408       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
7409       CHECK_VECTOR (initial);
7410       for (i = 0; i < 4; i++)
7411         {
7412           val = Faref (initial, make_number (i));
7413           if (! NILP (val))
7414             {
7415               CHECK_CHARSET_GET_ID (val, id);
7416               ASET (initial, i, make_number (id));
7417             }
7418           else
7419             ASET (initial, i, make_number (-1));
7420         }
7421
7422       reg_usage = args[coding_arg_iso2022_reg_usage];
7423       CHECK_CONS (reg_usage);
7424       CHECK_NATNUM (XCAR (reg_usage));
7425       CHECK_NATNUM (XCDR (reg_usage));
7426
7427       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
7428       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
7429         {
7430           int id;
7431
7432           val = Fcar (tail);
7433           CHECK_CONS (val);
7434           CHECK_CHARSET_GET_ID (XCAR (val), id);
7435           CHECK_NATNUM (XCDR (val));
7436           if (XINT (XCDR (val)) >= 4)
7437             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
7438           XCAR (val) = make_number (id);
7439         }
7440
7441       flags = args[coding_arg_iso2022_flags];
7442       CHECK_NATNUM (flags);
7443       i = XINT (flags);
7444       if (EQ (args[coding_arg_charset_list], Qiso_2022))
7445         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
7446
7447       ASET (attrs, coding_attr_iso_initial, initial);
7448       ASET (attrs, coding_attr_iso_usage, reg_usage);
7449       ASET (attrs, coding_attr_iso_request, request);
7450       ASET (attrs, coding_attr_iso_flags, flags);
7451       setup_iso_safe_charsets (attrs);
7452
7453       if (i & CODING_ISO_FLAG_SEVEN_BITS)
7454         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7455                           | CODING_ISO_FLAG_SINGLE_SHIFT))
7456                     ? coding_category_iso_7_else
7457                     : EQ (args[coding_arg_charset_list], Qiso_2022)
7458                     ? coding_category_iso_7
7459                     : coding_category_iso_7_tight);
7460       else
7461         {
7462           int id = XINT (AREF (initial, 1));
7463
7464           category = (((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7465                              | CODING_ISO_FLAG_SINGLE_SHIFT))
7466                        || EQ (args[coding_arg_charset_list], Qiso_2022)
7467                        || id < 0)
7468                       ? coding_category_iso_8_else
7469                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
7470                       ? coding_category_iso_8_1
7471                       : coding_category_iso_8_2);
7472         }
7473     }
7474   else if (EQ (coding_type, Qemacs_mule))
7475     {
7476       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
7477         ASET (attrs, coding_attr_emacs_mule_full, Qt);
7478
7479       category = coding_category_emacs_mule;
7480     }
7481   else if (EQ (coding_type, Qshift_jis))
7482     {
7483
7484       struct charset *charset;
7485
7486       if (XINT (Flength (charset_list)) != 3)
7487         error ("There should be just three charsets");
7488
7489       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7490       if (CHARSET_DIMENSION (charset) != 1)
7491         error ("Dimension of charset %s is not one",
7492                XSYMBOL (CHARSET_NAME (charset))->name->data);
7493
7494       charset_list = XCDR (charset_list);
7495       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7496       if (CHARSET_DIMENSION (charset) != 1)
7497         error ("Dimension of charset %s is not one",
7498                XSYMBOL (CHARSET_NAME (charset))->name->data);
7499
7500       charset_list = XCDR (charset_list);
7501       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7502       if (CHARSET_DIMENSION (charset) != 2)
7503         error ("Dimension of charset %s is not two",
7504                XSYMBOL (CHARSET_NAME (charset))->name->data);
7505
7506       category = coding_category_sjis;
7507       Vsjis_coding_system = name;
7508     }
7509   else if (EQ (coding_type, Qbig5))
7510     {
7511       struct charset *charset;
7512
7513       if (XINT (Flength (charset_list)) != 2)
7514         error ("There should be just two charsets");
7515
7516       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7517       if (CHARSET_DIMENSION (charset) != 1)
7518         error ("Dimension of charset %s is not one",
7519                XSYMBOL (CHARSET_NAME (charset))->name->data);
7520
7521       charset_list = XCDR (charset_list);
7522       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7523       if (CHARSET_DIMENSION (charset) != 2)
7524         error ("Dimension of charset %s is not two",
7525                XSYMBOL (CHARSET_NAME (charset))->name->data);
7526
7527       category = coding_category_big5;
7528       Vbig5_coding_system = name;
7529     }
7530   else if (EQ (coding_type, Qraw_text))
7531     category = coding_category_raw_text;
7532   else if (EQ (coding_type, Qutf_8))
7533     category = coding_category_utf_8;
7534   else if (EQ (coding_type, Qundecided))
7535     category = coding_category_undecided;
7536   else
7537     error ("Invalid coding system type: %s",
7538            XSYMBOL (coding_type)->name->data);
7539
7540   CODING_ATTR_CATEGORY (attrs) = make_number (category);
7541
7542   eol_type = args[coding_arg_eol_type];
7543   if (! NILP (eol_type)
7544       && ! EQ (eol_type, Qunix)
7545       && ! EQ (eol_type, Qdos)
7546       && ! EQ (eol_type, Qmac))
7547     error ("Invalid eol-type");
7548
7549   aliases = Fcons (name, Qnil);
7550
7551   if (NILP (eol_type))
7552     {
7553       eol_type = make_subsidiaries (name);
7554       for (i = 0; i < 3; i++)
7555         {
7556           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
7557
7558           this_name = AREF (eol_type, i);
7559           this_aliases = Fcons (this_name, Qnil);
7560           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
7561           this_spec = Fmake_vector (make_number (3), attrs);
7562           ASET (this_spec, 1, this_aliases);
7563           ASET (this_spec, 2, this_eol_type);
7564           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
7565           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
7566           Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
7567                                         Vcoding_system_alist);
7568         }
7569     }
7570
7571   spec_vec = Fmake_vector (make_number (3), attrs);
7572   ASET (spec_vec, 1, aliases);
7573   ASET (spec_vec, 2, eol_type);
7574
7575   Fputhash (name, spec_vec, Vcoding_system_hash_table);
7576   Vcoding_system_list = Fcons (name, Vcoding_system_list);
7577   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
7578                                 Vcoding_system_alist);
7579
7580   {
7581     int id = coding_categories[category].id;
7582
7583     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
7584       setup_coding_system (name, &coding_categories[category]);
7585   }
7586
7587   return Qnil;
7588
7589  short_args:
7590   return Fsignal (Qwrong_number_of_arguments,
7591                   Fcons (intern ("define-coding-system-internal"),
7592                          make_number (nargs)));
7593 }
7594
7595 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
7596        Sdefine_coding_system_alias, 2, 2, 0,
7597        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
7598      (alias, coding_system)
7599      Lisp_Object alias, coding_system;
7600 {
7601   Lisp_Object spec, aliases, eol_type;
7602
7603   CHECK_SYMBOL (alias);
7604   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7605   aliases = AREF (spec, 1);
7606   while (!NILP (XCDR (aliases)))
7607     aliases = XCDR (aliases);
7608   XCDR (aliases) = Fcons (alias, Qnil);
7609
7610   eol_type = AREF (spec, 2);
7611   if (VECTORP (eol_type))
7612     {
7613       Lisp_Object subsidiaries;
7614       int i;
7615
7616       subsidiaries = make_subsidiaries (alias);
7617       for (i = 0; i < 3; i++)
7618         Fdefine_coding_system_alias (AREF (subsidiaries, i),
7619                                      AREF (eol_type, i));
7620
7621       ASET (spec, 2, subsidiaries);
7622     }
7623
7624   Fputhash (alias, spec, Vcoding_system_hash_table);
7625   Vcoding_system_alist = Fcons (Fcons (alias, Qnil), Vcoding_system_alist);
7626
7627   return Qnil;
7628 }
7629
7630 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
7631        1, 1, 0,
7632        doc: /* Return the base of CODING-SYSTEM.
7633 Any alias or subsidiary coding systems are not base coding system.  */)
7634   (coding_system)
7635      Lisp_Object coding_system;
7636 {
7637   Lisp_Object spec, attrs;
7638
7639   if (NILP (coding_system))
7640     return (Qno_conversion);
7641   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7642   attrs = AREF (spec, 0);
7643   return CODING_ATTR_BASE_NAME (attrs);
7644 }
7645
7646 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
7647        1, 1, 0,
7648        doc: "Return the property list of CODING-SYSTEM.")
7649      (coding_system)
7650      Lisp_Object coding_system;
7651 {
7652   Lisp_Object spec, attrs;
7653
7654   if (NILP (coding_system))
7655     coding_system = Qno_conversion;
7656   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7657   attrs = AREF (spec, 0);
7658   return CODING_ATTR_PLIST (attrs);
7659 }
7660
7661
7662 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
7663        1, 1, 0,
7664        doc: /* Return the list of aliases of CODING-SYSTEM.
7665 A base coding system is what made by `define-coding-system'.
7666 Any alias nor subsidiary coding systems are not base coding system.  */)
7667      (coding_system)
7668      Lisp_Object coding_system;
7669 {
7670   Lisp_Object spec;
7671
7672   if (NILP (coding_system))
7673     coding_system = Qno_conversion;
7674   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7675   return AREF (spec, 2);
7676 }
7677
7678 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
7679        Scoding_system_eol_type, 1, 1, 0,
7680        doc: /* Return eol-type of CODING-SYSTEM.
7681 An eol-type is integer 0, 1, 2, or a vector of coding systems.
7682
7683 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7684 and CR respectively.
7685
7686 A vector value indicates that a format of end-of-line should be
7687 detected automatically.  Nth element of the vector is the subsidiary
7688 coding system whose eol-type is N.  */)
7689      (coding_system)
7690      Lisp_Object coding_system;
7691 {
7692   Lisp_Object spec, eol_type;
7693   int n;
7694
7695   if (NILP (coding_system))
7696     coding_system = Qno_conversion;
7697   if (! CODING_SYSTEM_P (coding_system))
7698     return Qnil;
7699   spec = CODING_SYSTEM_SPEC (coding_system);
7700   eol_type = AREF (spec, 2);
7701   if (VECTORP (eol_type))
7702     return Fcopy_sequence (eol_type);
7703   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
7704   return make_number (n);
7705 }
7706
7707 #endif /* emacs */
7708
7709 \f
7710 /*** 9. Post-amble ***/
7711
7712 void
7713 init_coding_once ()
7714 {
7715   int i;
7716
7717   for (i = 0; i < coding_category_max; i++)
7718     {
7719       coding_categories[i].id = -1;
7720       coding_priorities[i] = i;
7721     }
7722
7723   /* ISO2022 specific initialize routine.  */
7724   for (i = 0; i < 0x20; i++)
7725     iso_code_class[i] = ISO_control_0;
7726   for (i = 0x21; i < 0x7F; i++)
7727     iso_code_class[i] = ISO_graphic_plane_0;
7728   for (i = 0x80; i < 0xA0; i++)
7729     iso_code_class[i] = ISO_control_1;
7730   for (i = 0xA1; i < 0xFF; i++)
7731     iso_code_class[i] = ISO_graphic_plane_1;
7732   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7733   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7734   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7735   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7736   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7737   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7738   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7739   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7740   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7741   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7742
7743   inhibit_pre_post_conversion = 0;
7744
7745   for (i = 0; i < 256; i++)
7746     {
7747       emacs_mule_bytes[i] = 1;
7748     }
7749 }
7750
7751 #ifdef emacs
7752
7753 void
7754 syms_of_coding ()
7755 {
7756   staticpro (&Vcoding_system_hash_table);
7757   Vcoding_system_hash_table = Fmakehash (Qeq);
7758
7759   staticpro (&Vsjis_coding_system);
7760   Vsjis_coding_system = Qnil;
7761
7762   staticpro (&Vbig5_coding_system);
7763   Vbig5_coding_system = Qnil;
7764
7765   staticpro (&Vcode_conversion_work_buf_list);
7766   Vcode_conversion_work_buf_list = Qnil;
7767
7768   staticpro (&Vcode_conversion_reused_work_buf);
7769   Vcode_conversion_reused_work_buf = Qnil;
7770
7771   DEFSYM (Qcharset, "charset");
7772   DEFSYM (Qtarget_idx, "target-idx");
7773   DEFSYM (Qcoding_system_history, "coding-system-history");
7774   Fset (Qcoding_system_history, Qnil);
7775
7776   /* Target FILENAME is the first argument.  */
7777   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7778   /* Target FILENAME is the third argument.  */
7779   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7780
7781   DEFSYM (Qcall_process, "call-process");
7782   /* Target PROGRAM is the first argument.  */
7783   Fput (Qcall_process, Qtarget_idx, make_number (0));
7784
7785   DEFSYM (Qcall_process_region, "call-process-region");
7786   /* Target PROGRAM is the third argument.  */
7787   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7788
7789   DEFSYM (Qstart_process, "start-process");
7790   /* Target PROGRAM is the third argument.  */
7791   Fput (Qstart_process, Qtarget_idx, make_number (2));
7792
7793   DEFSYM (Qopen_network_stream, "open-network-stream");
7794   /* Target SERVICE is the fourth argument.  */
7795   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
7796
7797   DEFSYM (Qcoding_system, "coding-system");
7798   DEFSYM (Qcoding_aliases, "coding-aliases");
7799
7800   DEFSYM (Qeol_type, "eol-type");
7801   DEFSYM (Qunix, "unix");
7802   DEFSYM (Qdos, "dos");
7803   DEFSYM (Qmac, "mac");
7804
7805   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
7806   DEFSYM (Qpost_read_conversion, "post-read-conversion");
7807   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
7808   DEFSYM (Qdefault_char, "default-char");
7809   DEFSYM (Qundecided, "undecided");
7810   DEFSYM (Qno_conversion, "no-conversion");
7811   DEFSYM (Qraw_text, "raw-text");
7812
7813   DEFSYM (Qiso_2022, "iso-2022");
7814
7815   DEFSYM (Qutf_8, "utf-8");
7816
7817   DEFSYM (Qutf_16, "utf-16");
7818   DEFSYM (Qutf_16_be, "utf-16-be");
7819   DEFSYM (Qutf_16_be_nosig, "utf-16-be-nosig");
7820   DEFSYM (Qutf_16_le, "utf-16-l3");
7821   DEFSYM (Qutf_16_le_nosig, "utf-16-le-nosig");
7822   DEFSYM (Qsignature, "signature");
7823   DEFSYM (Qendian, "endian");
7824   DEFSYM (Qbig, "big");
7825   DEFSYM (Qlittle, "little");
7826
7827   DEFSYM (Qshift_jis, "shift-jis");
7828   DEFSYM (Qbig5, "big5");
7829
7830   DEFSYM (Qcoding_system_p, "coding-system-p");
7831
7832   DEFSYM (Qcoding_system_error, "coding-system-error");
7833   Fput (Qcoding_system_error, Qerror_conditions,
7834         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
7835   Fput (Qcoding_system_error, Qerror_message,
7836         build_string ("Invalid coding system"));
7837
7838   /* Intern this now in case it isn't already done.
7839      Setting this variable twice is harmless.
7840      But don't staticpro it here--that is done in alloc.c.  */
7841   Qchar_table_extra_slots = intern ("char-table-extra-slots");
7842
7843   DEFSYM (Qtranslation_table, "translation-table");
7844   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
7845   DEFSYM (Qtranslation_table_id, "translation-table-id");
7846   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
7847   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
7848
7849   DEFSYM (Qchar_coding_system, "char-coding-system");
7850
7851   Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (2));
7852
7853   DEFSYM (Qvalid_codes, "valid-codes");
7854
7855   DEFSYM (Qemacs_mule, "emacs-mule");
7856
7857   Vcoding_category_table
7858     = Fmake_vector (make_number (coding_category_max), Qnil);
7859   staticpro (&Vcoding_category_table);
7860   /* Followings are target of code detection.  */
7861   ASET (Vcoding_category_table, coding_category_iso_7,
7862         intern ("coding-category-iso-7"));
7863   ASET (Vcoding_category_table, coding_category_iso_7_tight,
7864         intern ("coding-category-iso-7-tight"));
7865   ASET (Vcoding_category_table, coding_category_iso_8_1,
7866         intern ("coding-category-iso-8-1"));
7867   ASET (Vcoding_category_table, coding_category_iso_8_2,
7868         intern ("coding-category-iso-8-2"));
7869   ASET (Vcoding_category_table, coding_category_iso_7_else,
7870         intern ("coding-category-iso-7-else"));
7871   ASET (Vcoding_category_table, coding_category_iso_8_else,
7872         intern ("coding-category-iso-8-else"));
7873   ASET (Vcoding_category_table, coding_category_utf_8,
7874         intern ("coding-category-utf-8"));
7875   ASET (Vcoding_category_table, coding_category_utf_16_be,
7876         intern ("coding-category-utf-16-be"));
7877   ASET (Vcoding_category_table, coding_category_utf_16_le,
7878         intern ("coding-category-utf-16-le"));
7879   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
7880         intern ("coding-category-utf-16-be-nosig"));
7881   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
7882         intern ("coding-category-utf-16-le-nosig"));
7883   ASET (Vcoding_category_table, coding_category_charset,
7884         intern ("coding-category-charset"));
7885   ASET (Vcoding_category_table, coding_category_sjis,
7886         intern ("coding-category-sjis"));
7887   ASET (Vcoding_category_table, coding_category_big5,
7888         intern ("coding-category-big5"));
7889   ASET (Vcoding_category_table, coding_category_ccl,
7890         intern ("coding-category-ccl"));
7891   ASET (Vcoding_category_table, coding_category_emacs_mule,
7892         intern ("coding-category-emacs-mule"));
7893   /* Followings are NOT target of code detection.  */
7894   ASET (Vcoding_category_table, coding_category_raw_text,
7895         intern ("coding-category-raw-text"));
7896   ASET (Vcoding_category_table, coding_category_undecided,
7897         intern ("coding-category-undecided"));
7898
7899   defsubr (&Scoding_system_p);
7900   defsubr (&Sread_coding_system);
7901   defsubr (&Sread_non_nil_coding_system);
7902   defsubr (&Scheck_coding_system);
7903   defsubr (&Sdetect_coding_region);
7904   defsubr (&Sdetect_coding_string);
7905   defsubr (&Sfind_coding_systems_region_internal);
7906   defsubr (&Scheck_coding_systems_region);
7907   defsubr (&Sdecode_coding_region);
7908   defsubr (&Sencode_coding_region);
7909   defsubr (&Sdecode_coding_string);
7910   defsubr (&Sencode_coding_string);
7911   defsubr (&Sdecode_sjis_char);
7912   defsubr (&Sencode_sjis_char);
7913   defsubr (&Sdecode_big5_char);
7914   defsubr (&Sencode_big5_char);
7915   defsubr (&Sset_terminal_coding_system_internal);
7916   defsubr (&Sset_safe_terminal_coding_system_internal);
7917   defsubr (&Sterminal_coding_system);
7918   defsubr (&Sset_keyboard_coding_system_internal);
7919   defsubr (&Skeyboard_coding_system);
7920   defsubr (&Sfind_operation_coding_system);
7921   defsubr (&Sset_coding_system_priority);
7922   defsubr (&Sdefine_coding_system_internal);
7923   defsubr (&Sdefine_coding_system_alias);
7924   defsubr (&Scoding_system_base);
7925   defsubr (&Scoding_system_plist);
7926   defsubr (&Scoding_system_aliases);
7927   defsubr (&Scoding_system_eol_type);
7928   defsubr (&Scoding_system_priority_list);
7929
7930   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
7931                doc: /* List of coding systems.
7932
7933 Do not alter the value of this variable manually.  This variable should be
7934 updated by the functions `define-coding-system' and
7935 `define-coding-system-alias'.  */);
7936   Vcoding_system_list = Qnil;
7937
7938   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
7939                doc: /* Alist of coding system names.
7940 Each element is one element list of coding system name.
7941 This variable is given to `completing-read' as TABLE argument.
7942
7943 Do not alter the value of this variable manually.  This variable should be
7944 updated by the functions `make-coding-system' and
7945 `define-coding-system-alias'.  */);
7946   Vcoding_system_alist = Qnil;
7947
7948   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
7949                doc: /* List of coding-categories (symbols) ordered by priority.
7950
7951 On detecting a coding system, Emacs tries code detection algorithms
7952 associated with each coding-category one by one in this order.  When
7953 one algorithm agrees with a byte sequence of source text, the coding
7954 system bound to the corresponding coding-category is selected.  */);
7955   {
7956     int i;
7957
7958     Vcoding_category_list = Qnil;
7959     for (i = coding_category_max - 1; i >= 0; i--)
7960       Vcoding_category_list
7961         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
7962                  Vcoding_category_list);
7963   }
7964
7965   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
7966                doc: /* Specify the coding system for read operations.
7967 It is useful to bind this variable with `let', but do not set it globally.
7968 If the value is a coding system, it is used for decoding on read operation.
7969 If not, an appropriate element is used from one of the coding system alists:
7970 There are three such tables, `file-coding-system-alist',
7971 `process-coding-system-alist', and `network-coding-system-alist'.  */);
7972   Vcoding_system_for_read = Qnil;
7973
7974   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
7975                doc: /* Specify the coding system for write operations.
7976 Programs bind this variable with `let', but you should not set it globally.
7977 If the value is a coding system, it is used for encoding of output,
7978 when writing it to a file and when sending it to a file or subprocess.
7979
7980 If this does not specify a coding system, an appropriate element
7981 is used from one of the coding system alists:
7982 There are three such tables, `file-coding-system-alist',
7983 `process-coding-system-alist', and `network-coding-system-alist'.
7984 For output to files, if the above procedure does not specify a coding system,
7985 the value of `buffer-file-coding-system' is used.  */);
7986   Vcoding_system_for_write = Qnil;
7987
7988   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
7989                doc: /*
7990 Coding system used in the latest file or process I/O.  */);
7991   Vlast_coding_system_used = Qnil;
7992
7993   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
7994                doc: /*
7995 *Non-nil means always inhibit code conversion of end-of-line format.
7996 See info node `Coding Systems' and info node `Text and Binary' concerning
7997 such conversion.  */);
7998   inhibit_eol_conversion = 0;
7999
8000   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8001                doc: /*
8002 Non-nil means process buffer inherits coding system of process output.
8003 Bind it to t if the process output is to be treated as if it were a file
8004 read from some filesystem.  */);
8005   inherit_process_coding_system = 0;
8006
8007   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8008                doc: /*
8009 Alist to decide a coding system to use for a file I/O operation.
8010 The format is ((PATTERN . VAL) ...),
8011 where PATTERN is a regular expression matching a file name,
8012 VAL is a coding system, a cons of coding systems, or a function symbol.
8013 If VAL is a coding system, it is used for both decoding and encoding
8014 the file contents.
8015 If VAL is a cons of coding systems, the car part is used for decoding,
8016 and the cdr part is used for encoding.
8017 If VAL is a function symbol, the function must return a coding system
8018 or a cons of coding systems which are used as above.  The function gets
8019 the arguments with which `find-operation-coding-systems' was called.
8020
8021 See also the function `find-operation-coding-system'
8022 and the variable `auto-coding-alist'.  */);
8023   Vfile_coding_system_alist = Qnil;
8024
8025   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8026                doc: /*
8027 Alist to decide a coding system to use for a process I/O operation.
8028 The format is ((PATTERN . VAL) ...),
8029 where PATTERN is a regular expression matching a program name,
8030 VAL is a coding system, a cons of coding systems, or a function symbol.
8031 If VAL is a coding system, it is used for both decoding what received
8032 from the program and encoding what sent to the program.
8033 If VAL is a cons of coding systems, the car part is used for decoding,
8034 and the cdr part is used for encoding.
8035 If VAL is a function symbol, the function must return a coding system
8036 or a cons of coding systems which are used as above.
8037
8038 See also the function `find-operation-coding-system'.  */);
8039   Vprocess_coding_system_alist = Qnil;
8040
8041   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8042                doc: /*
8043 Alist to decide a coding system to use for a network I/O operation.
8044 The format is ((PATTERN . VAL) ...),
8045 where PATTERN is a regular expression matching a network service name
8046 or is a port number to connect to,
8047 VAL is a coding system, a cons of coding systems, or a function symbol.
8048 If VAL is a coding system, it is used for both decoding what received
8049 from the network stream and encoding what sent to the network stream.
8050 If VAL is a cons of coding systems, the car part is used for decoding,
8051 and the cdr part is used for encoding.
8052 If VAL is a function symbol, the function must return a coding system
8053 or a cons of coding systems which are used as above.
8054
8055 See also the function `find-operation-coding-system'.  */);
8056   Vnetwork_coding_system_alist = Qnil;
8057
8058   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8059                doc: /* Coding system to use with system messages.
8060 Also used for decoding keyboard input on X Window system.  */);
8061   Vlocale_coding_system = Qnil;
8062
8063   /* The eol mnemonics are reset in startup.el system-dependently.  */
8064   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8065                doc: /*
8066 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8067   eol_mnemonic_unix = build_string (":");
8068
8069   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8070                doc: /*
8071 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8072   eol_mnemonic_dos = build_string ("\\");
8073
8074   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8075                doc: /*
8076 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8077   eol_mnemonic_mac = build_string ("/");
8078
8079   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8080                doc: /*
8081 *String displayed in mode line when end-of-line format is not yet determined.  */);
8082   eol_mnemonic_undecided = build_string (":");
8083
8084   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8085                doc: /*
8086 *Non-nil enables character translation while encoding and decoding.  */);
8087   Venable_character_translation = Qt;
8088
8089   DEFVAR_LISP ("standard-translation-table-for-decode",
8090                &Vstandard_translation_table_for_decode,
8091                doc: /* Table for translating characters while decoding.  */);
8092   Vstandard_translation_table_for_decode = Qnil;
8093
8094   DEFVAR_LISP ("standard-translation-table-for-encode",
8095                &Vstandard_translation_table_for_encode,
8096                doc: /* Table for translating characters while encoding.  */);
8097   Vstandard_translation_table_for_encode = Qnil;
8098
8099   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8100                doc: /* Alist of charsets vs revision numbers.
8101 While encoding, if a charset (car part of an element) is found,
8102 designate it with the escape sequence identifying revision (cdr part
8103 of the element).  */);
8104   Vcharset_revision_table = Qnil;
8105
8106   DEFVAR_LISP ("default-process-coding-system",
8107                &Vdefault_process_coding_system,
8108                doc: /* Cons of coding systems used for process I/O by default.
8109 The car part is used for decoding a process output,
8110 the cdr part is used for encoding a text to be sent to a process.  */);
8111   Vdefault_process_coding_system = Qnil;
8112
8113   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8114                doc: /*
8115 Table of extra Latin codes in the range 128..159 (inclusive).
8116 This is a vector of length 256.
8117 If Nth element is non-nil, the existence of code N in a file
8118 \(or output of subprocess) doesn't prevent it to be detected as
8119 a coding system of ISO 2022 variant which has a flag
8120 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8121 or reading output of a subprocess.
8122 Only 128th through 159th elements has a meaning.  */);
8123   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8124
8125   DEFVAR_LISP ("select-safe-coding-system-function",
8126                &Vselect_safe_coding_system_function,
8127                doc: /*
8128 Function to call to select safe coding system for encoding a text.
8129
8130 If set, this function is called to force a user to select a proper
8131 coding system which can encode the text in the case that a default
8132 coding system used in each operation can't encode the text.
8133
8134 The default value is `select-safe-coding-system' (which see).  */);
8135   Vselect_safe_coding_system_function = Qnil;
8136
8137   DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table,
8138                doc: /*
8139 Char-table containing safe coding systems of each characters.
8140 Each element doesn't include such generic coding systems that can
8141 encode any characters.   They are in the first extra slot.  */);
8142   Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil);
8143
8144   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8145                &inhibit_iso_escape_detection,
8146                doc: /*
8147 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8148
8149 By default, on reading a file, Emacs tries to detect how the text is
8150 encoded.  This code detection is sensitive to escape sequences.  If
8151 the sequence is valid as ISO2022, the code is determined as one of
8152 the ISO2022 encodings, and the file is decoded by the corresponding
8153 coding system (e.g. `iso-2022-7bit').
8154
8155 However, there may be a case that you want to read escape sequences in
8156 a file as is.  In such a case, you can set this variable to non-nil.
8157 Then, as the code detection ignores any escape sequences, no file is
8158 detected as encoded in some ISO2022 encoding.  The result is that all
8159 escape sequences become visible in a buffer.
8160
8161 The default value is nil, and it is strongly recommended not to change
8162 it.  That is because many Emacs Lisp source files that contain
8163 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8164 in Emacs's distribution, and they won't be decoded correctly on
8165 reading if you suppress escape sequence detection.
8166
8167 The other way to read escape sequences in a file without decoding is
8168 to explicitly specify some coding system that doesn't use ISO2022's
8169 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8170   inhibit_iso_escape_detection = 0;
8171
8172   {
8173     Lisp_Object args[coding_arg_max];
8174     Lisp_Object plist[14];
8175     int i;
8176
8177     for (i = 0; i < coding_arg_max; i++)
8178       args[i] = Qnil;
8179
8180     plist[0] = intern (":name");
8181     plist[1] = args[coding_arg_name] = Qno_conversion;
8182     plist[2] = intern (":mnemonic");
8183     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8184     plist[4] = intern (":coding-type");
8185     plist[5] = args[coding_arg_coding_type] = Qraw_text;
8186     plist[6] = intern (":ascii-compatible-p");
8187     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8188     plist[8] = intern (":default-char");
8189     plist[9] = args[coding_arg_default_char] = make_number (0);
8190     plist[10] = intern (":docstring");
8191     plist[11] = build_string ("Do no conversion.\n\
8192 \n\
8193 When you visit a file with this coding, the file is read into a\n\
8194 unibyte buffer as is, thus each byte of a file is treated as a\n\
8195 character.");
8196     plist[12] = intern (":eol-type");
8197     plist[13] = args[coding_arg_eol_type] = Qunix;
8198     args[coding_arg_plist] = Flist (14, plist);
8199     Fdefine_coding_system_internal (coding_arg_max, args);
8200   }
8201
8202   setup_coding_system (Qno_conversion, &keyboard_coding);
8203   setup_coding_system (Qno_conversion, &terminal_coding);
8204   setup_coding_system (Qno_conversion, &safe_terminal_coding);
8205 }
8206
8207 char *
8208 emacs_strerror (error_number)
8209      int error_number;
8210 {
8211   char *str;
8212
8213   synchronize_system_messages_locale ();
8214   str = strerror (error_number);
8215
8216   if (! NILP (Vlocale_coding_system))
8217     {
8218       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8219                                                       Vlocale_coding_system,
8220                                                       0);
8221       str = (char *) XSTRING (dec)->data;
8222     }
8223
8224   return str;
8225 }
8226
8227 #endif /* emacs */