src/coding.c

   1 /* Coding system handler (conversion, detection, and etc).
   2    Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
   3    Licensed to the Free Software Foundation.
   4    Copyright (C) 2001 Free Software Foundation, Inc.
   5    Copyright (C) 2001, 2002
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /*** TABLE OF CONTENTS ***
  27
  28   0. General comments
  29   1. Preamble
  30   2. Emacs' internal format (emacs-utf-8) handlers
  31   3. UTF-8 handlers
  32   4. UTF-16 handlers
  33   5. Charset-base coding systems handlers
  34   6. emacs-mule (old Emacs' internal format) handlers
  35   7. ISO2022 handlers
  36   8. Shift-JIS and BIG5 handlers
  37   9. CCL handlers
  38   10. C library functions
  39   11. Emacs Lisp library functions
  40   12. Postamble
  41
  42 */
  43
  44 /*** 0. General comments ***
  45
  46
  47 CODING SYSTEM
  48
  49   A coding system is an object for an encoding mechanism that contains
  50   information about how to convert byte sequences to character
  51   sequences and vice versa.  When we say "decode", it means converting
  52   a byte sequence of a specific coding system into a character
  53   sequence that is represented by Emacs' internal coding system
  54   `emacs-utf-8', and when we say "encode", it means converting a
  55   character sequence of emacs-utf-8 to a byte sequence of a specific
  56   coding system.
  57
  58   In Emacs Lisp, a coding system is represented by a Lisp symbol.  In
  59   C level, a coding system is represented by a vector of attributes
  60   stored in the hash table Vcharset_hash_table.  The conversion from
  61   coding system symbol to attributes vector is done by looking up
  62   Vcharset_hash_table by the symbol.
  63
  64   Coding systems are classified into the following types depending on
  65   the encoding mechanism.  Here's a brief description of the types.
  66
  67   o UTF-8
  68
  69   o UTF-16
  70
  71   o Charset-base coding system
  72
  73   A coding system defined by one or more (coded) character sets.
  74   Decoding and encoding are done by a code converter defined for each
  75   character set.
  76
  77   o Old Emacs internal format (emacs-mule)
  78
  79   The coding system adopted by old versions of Emacs (20 and 21).
  80
  81   o ISO2022-base coding system
  82
  83   The most famous coding system for multiple character sets.  X's
  84   Compound Text, various EUCs (Extended Unix Code), and coding systems
  85   used in the Internet communication such as ISO-2022-JP are all
  86   variants of ISO2022.
  87
  88   o SJIS (or Shift-JIS or MS-Kanji-Code)
  89
  90   A coding system to encode character sets: ASCII, JISX0201, and
  91   JISX0208.  Widely used for PC's in Japan.  Details are described in
  92   section 8.
  93
  94   o BIG5
  95
  96   A coding system to encode character sets: ASCII and Big5.  Widely
  97   used by Chinese (mainly in Taiwan and Hong Kong).  Details are
  98   described in section 8.  In this file, when we write "big5" (all
  99   lowercase), we mean the coding system, and when we write "Big5"
 100   (capitalized), we mean the character set.
 101
 102   o CCL
 103
 104   If a user wants to decode/encode text encoded in a coding system
 105   not listed above, he can supply a decoder and an encoder for it in
 106   CCL (Code Conversion Language) programs.  Emacs executes the CCL
 107   program while decoding/encoding.
 108
 109   o Raw-text
 110
 111   A coding system for a text containing raw eight-bit data.  Emacs
 112   treats each byte of source text as a character (except for
 113   end-of-line conversion).
 114
 115   o No-conversion
 116
 117   Like raw text, but don't do end-of-line conversion.
 118
 119
 120 END-OF-LINE FORMAT
 121
 122   How text end-of-line is encoded depends on operating system.  For
 123   instance, Unix's format is just one byte of LF (line-feed) code,
 124   whereas DOS's format is two-byte sequence of `carriage-return' and
 125   `line-feed' codes.  MacOS's format is usually one byte of
 126   `carriage-return'.
 127
 128   Since text character encoding and end-of-line encoding are
 129   independent, any coding system described above can take any format
 130   of end-of-line (except for no-conversion).
 131
 132 STRUCT CODING_SYSTEM
 133
 134   Before using a coding system for code conversion (i.e. decoding and
 135   encoding), we setup a structure of type `struct coding_system'.
 136   This structure keeps various information about a specific code
 137   conversion (e.g. the location of source and destination data).
 138
 139 */
 140
 141 /* COMMON MACROS */
 142
 143
 144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
 145
 146   These functions check if a byte sequence specified as a source in
 147   CODING conforms to the format of XXX.  Return 1 if the data contains
 148   a byte sequence which can be decoded into non-ASCII characters by
 149   the coding system.  Otherwize (i.e. the data contains only ASCII
 150   characters or invalid sequence) return 0.
 151
 152   It also resets some bits of an integer pointed by MASK.  The macros
 153   CATEGORY_MASK_XXX specifies each bit of this integer.
 154
 155   Below is the template of these functions.  */
 156
 157 #if 0
 158 static int
 159 detect_coding_XXX (coding, mask)
 160      struct coding_system *coding;
 161      int *mask;
 162 {
 163   unsigned char *src = coding->source;
 164   unsigned char *src_end = coding->source + coding->src_bytes;
 165   int multibytep = coding->src_multibyte;
 166   int c;
 167   int found = 0;
 168   ...;
 169
 170   while (1)
 171     {
 172       /* Get one byte from the source.  If the souce is exausted, jump
 173          to no_more_source:.  */
 174       ONE_MORE_BYTE (c);
 175       /* Check if it conforms to XXX.  If not, break the loop.  */
 176     }
 177   /* As the data is invalid for XXX, reset a proper bits.  */
 178   *mask &= ~CODING_CATEGORY_XXX;
 179   return 0;
 180  no_more_source:
 181   /* The source exausted.  */
 182   if (!found)
 183     /* ASCII characters only. */
 184     return 0;
 185   /* Some data should be decoded into non-ASCII characters.  */
 186   *mask &= CODING_CATEGORY_XXX;
 187   return 1;
 188 }
 189 #endif
 190
 191 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
 192
 193   These functions decode a byte sequence specified as a source by
 194   CODING.  The resulting multibyte text goes to a place pointed to by
 195   CODING->charbuf, the length of which should not exceed
 196   CODING->charbuf_size;
 197
 198   These functions set the information of original and decoded texts in
 199   CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
 200   They also set CODING->result to one of CODING_RESULT_XXX indicating
 201   how the decoding is finished.
 202
 203   Below is the template of these functions.  */
 204
 205 #if 0
 206 static void
 207 decode_coding_XXXX (coding)
 208      struct coding_system *coding;
 209 {
 210   unsigned char *src = coding->source + coding->consumed;
 211   unsigned char *src_end = coding->source + coding->src_bytes;
 212   /* SRC_BASE remembers the start position in source in each loop.
 213      The loop will be exited when there's not enough source code, or
 214      when there's no room in CHARBUF for a decoded character.  */
 215   unsigned char *src_base;
 216   /* A buffer to produce decoded characters.  */
 217   int *charbuf = coding->charbuf;
 218   int *charbuf_end = charbuf + coding->charbuf_size;
 219   int multibytep = coding->src_multibyte;
 220
 221   while (1)
 222     {
 223       src_base = src;
 224       if (charbuf < charbuf_end)
 225         /* No more room to produce a decoded character.  */
 226         break;
 227       ONE_MORE_BYTE (c);
 228       /* Decode it. */
 229     }
 230
 231  no_more_source:
 232   if (src_base < src_end
 233       && coding->mode & CODING_MODE_LAST_BLOCK)
 234     /* If the source ends by partial bytes to construct a character,
 235        treat them as eight-bit raw data.  */
 236     while (src_base < src_end && charbuf < charbuf_end)
 237       *charbuf++ = *src_base++;
 238   /* Remember how many bytes and characters we consumed.  If the
 239      source is multibyte, the bytes and chars are not identical.  */
 240   coding->consumed = coding->consumed_char = src_base - coding->source;
 241   /* Remember how many characters we produced.  */
 242   coding->charbuf_used = charbuf - coding->charbuf;
 243 }
 244 #endif
 245
 246 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
 247
 248   These functions encode SRC_BYTES length text at SOURCE of Emacs'
 249   internal multibyte format by CODING.  The resulting byte sequence
 250   goes to a place pointed to by DESTINATION, the length of which
 251   should not exceed DST_BYTES.
 252
 253   These functions set the information of original and encoded texts in
 254   the members produced, produced_char, consumed, and consumed_char of
 255   the structure *CODING.  They also set the member result to one of
 256   CODING_RESULT_XXX indicating how the encoding finished.
 257
 258   DST_BYTES zero means that source area and destination area are
 259   overlapped, which means that we can produce a encoded text until it
 260   reaches at the head of not-yet-encoded source text.
 261
 262   Below is a template of these functions.  */
 263 #if 0
 264 static void
 265 encode_coding_XXX (coding)
 266      struct coding_system *coding;
 267 {
 268   int multibytep = coding->dst_multibyte;
 269   int *charbuf = coding->charbuf;
 270   int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
 271   unsigned char *dst = coding->destination + coding->produced;
 272   unsigned char *dst_end = coding->destination + coding->dst_bytes;
 273   unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
 274   int produced_chars = 0;
 275
 276   for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
 277     {
 278       int c = *charbuf;
 279       /* Encode C into DST, and increment DST.  */
 280     }
 281  label_no_more_destination:
 282   /* How many chars and bytes we produced.  */
 283   coding->produced_char += produced_chars;
 284   coding->produced = dst - coding->destination;
 285 }
 286 #endif
 287
 288 \f
 289 /*** 1. Preamble ***/
 290
 291 #include <config.h>
 292 #include <stdio.h>
 293
 294 #include "lisp.h"
 295 #include "buffer.h"
 296 #include "character.h"
 297 #include "charset.h"
 298 #include "ccl.h"
 299 #include "composite.h"
 300 #include "coding.h"
 301 #include "window.h"
 302
 303 Lisp_Object Vcoding_system_hash_table;
 304
 305 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
 306 Lisp_Object Qunix, Qdos;
 307 extern Lisp_Object Qmac;        /* frame.c */
 308 Lisp_Object Qbuffer_file_coding_system;
 309 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
 310 Lisp_Object Qdefault_char;
 311 Lisp_Object Qno_conversion, Qundecided;
 312 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
 313 Lisp_Object Qutf_16_be_nosig, Qutf_16_be, Qutf_16_le_nosig, Qutf_16_le;
 314 Lisp_Object Qsignature, Qendian, Qbig, Qlittle;
 315 Lisp_Object Qcoding_system_history;
 316 Lisp_Object Qvalid_codes;
 317
 318 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
 319 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
 320 Lisp_Object Qstart_process, Qopen_network_stream;
 321 Lisp_Object Qtarget_idx;
 322
 323 Lisp_Object Vselect_safe_coding_system_function;
 324
 325 /* Mnemonic string for each format of end-of-line.  */
 326 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
 327 /* Mnemonic string to indicate format of end-of-line is not yet
 328    decided.  */
 329 Lisp_Object eol_mnemonic_undecided;
 330
 331 #ifdef emacs
 332
 333 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
 334
 335 Lisp_Object Qcoding_system_p, Qcoding_system_error;
 336
 337 /* Coding system emacs-mule and raw-text are for converting only
 338    end-of-line format.  */
 339 Lisp_Object Qemacs_mule, Qraw_text;
 340
 341 /* Coding-systems are handed between Emacs Lisp programs and C internal
 342    routines by the following three variables.  */
 343 /* Coding-system for reading files and receiving data from process.  */
 344 Lisp_Object Vcoding_system_for_read;
 345 /* Coding-system for writing files and sending data to process.  */
 346 Lisp_Object Vcoding_system_for_write;
 347 /* Coding-system actually used in the latest I/O.  */
 348 Lisp_Object Vlast_coding_system_used;
 349
 350 /* A vector of length 256 which contains information about special
 351    Latin codes (especially for dealing with Microsoft codes).  */
 352 Lisp_Object Vlatin_extra_code_table;
 353
 354 /* Flag to inhibit code conversion of end-of-line format.  */
 355 int inhibit_eol_conversion;
 356
 357 /* Flag to inhibit ISO2022 escape sequence detection.  */
 358 int inhibit_iso_escape_detection;
 359
 360 /* Flag to make buffer-file-coding-system inherit from process-coding.  */
 361 int inherit_process_coding_system;
 362
 363 /* Coding system to be used to encode text for terminal display.  */
 364 struct coding_system terminal_coding;
 365
 366 /* Coding system to be used to encode text for terminal display when
 367    terminal coding system is nil.  */
 368 struct coding_system safe_terminal_coding;
 369
 370 /* Coding system of what is sent from terminal keyboard.  */
 371 struct coding_system keyboard_coding;
 372
 373 Lisp_Object Vfile_coding_system_alist;
 374 Lisp_Object Vprocess_coding_system_alist;
 375 Lisp_Object Vnetwork_coding_system_alist;
 376
 377 Lisp_Object Vlocale_coding_system;
 378
 379 #endif /* emacs */
 380
 381 /* Flag to tell if we look up translation table on character code
 382    conversion.  */
 383 Lisp_Object Venable_character_translation;
 384 /* Standard translation table to look up on decoding (reading).  */
 385 Lisp_Object Vstandard_translation_table_for_decode;
 386 /* Standard translation table to look up on encoding (writing).  */
 387 Lisp_Object Vstandard_translation_table_for_encode;
 388
 389 Lisp_Object Qtranslation_table;
 390 Lisp_Object Qtranslation_table_id;
 391 Lisp_Object Qtranslation_table_for_decode;
 392 Lisp_Object Qtranslation_table_for_encode;
 393
 394 /* Alist of charsets vs revision number.  */
 395 static Lisp_Object Vcharset_revision_table;
 396
 397 /* Default coding systems used for process I/O.  */
 398 Lisp_Object Vdefault_process_coding_system;
 399
 400 /* Global flag to tell that we can't call post-read-conversion and
 401    pre-write-conversion functions.  Usually the value is zero, but it
 402    is set to 1 temporarily while such functions are running.  This is
 403    to avoid infinite recursive call.  */
 404 static int inhibit_pre_post_conversion;
 405
 406 /* Two special coding systems.  */
 407 Lisp_Object Vsjis_coding_system;
 408 Lisp_Object Vbig5_coding_system;
 409
 410
 411 static int detect_coding_utf_8 P_ ((struct coding_system *, int *));
 412 static void decode_coding_utf_8 P_ ((struct coding_system *));
 413 static int encode_coding_utf_8 P_ ((struct coding_system *));
 414
 415 static int detect_coding_utf_16 P_ ((struct coding_system *, int *));
 416 static void decode_coding_utf_16 P_ ((struct coding_system *));
 417 static int encode_coding_utf_16 P_ ((struct coding_system *));
 418
 419 static int detect_coding_iso_2022 P_ ((struct coding_system *, int *));
 420 static void decode_coding_iso_2022 P_ ((struct coding_system *));
 421 static int encode_coding_iso_2022 P_ ((struct coding_system *));
 422
 423 static int detect_coding_emacs_mule P_ ((struct coding_system *, int *));
 424 static void decode_coding_emacs_mule P_ ((struct coding_system *));
 425 static int encode_coding_emacs_mule P_ ((struct coding_system *));
 426
 427 static int detect_coding_sjis P_ ((struct coding_system *, int *));
 428 static void decode_coding_sjis P_ ((struct coding_system *));
 429 static int encode_coding_sjis P_ ((struct coding_system *));
 430
 431 static int detect_coding_big5 P_ ((struct coding_system *, int *));
 432 static void decode_coding_big5 P_ ((struct coding_system *));
 433 static int encode_coding_big5 P_ ((struct coding_system *));
 434
 435 static int detect_coding_ccl P_ ((struct coding_system *, int *));
 436 static void decode_coding_ccl P_ ((struct coding_system *));
 437 static int encode_coding_ccl P_ ((struct coding_system *));
 438
 439 static void decode_coding_raw_text P_ ((struct coding_system *));
 440 static int encode_coding_raw_text P_ ((struct coding_system *));
 441
 442
 443 /* ISO2022 section */
 444
 445 #define CODING_ISO_INITIAL(coding, reg)                 \
 446   (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id),    \
 447                      coding_attr_iso_initial),          \
 448                reg)))
 449
 450
 451 #define CODING_ISO_REQUEST(coding, charset_id)  \
 452   ((charset_id <= (coding)->max_charset_id      \
 453     ? (coding)->safe_charsets[charset_id]       \
 454     : -1))
 455
 456
 457 #define CODING_ISO_FLAGS(coding)        \
 458   ((coding)->spec.iso_2022.flags)
 459 #define CODING_ISO_DESIGNATION(coding, reg)     \
 460   ((coding)->spec.iso_2022.current_designation[reg])
 461 #define CODING_ISO_INVOCATION(coding, plane)    \
 462   ((coding)->spec.iso_2022.current_invocation[plane])
 463 #define CODING_ISO_SINGLE_SHIFTING(coding)      \
 464   ((coding)->spec.iso_2022.single_shifting)
 465 #define CODING_ISO_BOL(coding)  \
 466   ((coding)->spec.iso_2022.bol)
 467 #define CODING_ISO_INVOKED_CHARSET(coding, plane)       \
 468   CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
 469
 470 /* Control characters of ISO2022.  */
 471                         /* code */      /* function */
 472 #define ISO_CODE_LF     0x0A            /* line-feed */
 473 #define ISO_CODE_CR     0x0D            /* carriage-return */
 474 #define ISO_CODE_SO     0x0E            /* shift-out */
 475 #define ISO_CODE_SI     0x0F            /* shift-in */
 476 #define ISO_CODE_SS2_7  0x19            /* single-shift-2 for 7-bit code */
 477 #define ISO_CODE_ESC    0x1B            /* escape */
 478 #define ISO_CODE_SS2    0x8E            /* single-shift-2 */
 479 #define ISO_CODE_SS3    0x8F            /* single-shift-3 */
 480 #define ISO_CODE_CSI    0x9B            /* control-sequence-introducer */
 481
 482 /* All code (1-byte) of ISO2022 is classified into one of the
 483    followings.  */
 484 enum iso_code_class_type
 485   {
 486     ISO_control_0,              /* Control codes in the range
 487                                    0x00..0x1F and 0x7F, except for the
 488                                    following 5 codes.  */
 489     ISO_carriage_return,        /* ISO_CODE_CR (0x0D) */
 490     ISO_shift_out,              /* ISO_CODE_SO (0x0E) */
 491     ISO_shift_in,               /* ISO_CODE_SI (0x0F) */
 492     ISO_single_shift_2_7,       /* ISO_CODE_SS2_7 (0x19) */
 493     ISO_escape,                 /* ISO_CODE_SO (0x1B) */
 494     ISO_control_1,              /* Control codes in the range
 495                                    0x80..0x9F, except for the
 496                                    following 3 codes.  */
 497     ISO_single_shift_2,         /* ISO_CODE_SS2 (0x8E) */
 498     ISO_single_shift_3,         /* ISO_CODE_SS3 (0x8F) */
 499     ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
 500     ISO_0x20_or_0x7F,           /* Codes of the values 0x20 or 0x7F.  */
 501     ISO_graphic_plane_0,        /* Graphic codes in the range 0x21..0x7E.  */
 502     ISO_0xA0_or_0xFF,           /* Codes of the values 0xA0 or 0xFF.  */
 503     ISO_graphic_plane_1         /* Graphic codes in the range 0xA1..0xFE.  */
 504   };
 505
 506 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
 507     `iso-flags' attribute of an iso2022 coding system.  */
 508
 509 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
 510    instead of the correct short-form sequence (e.g. ESC $ A).  */
 511 #define CODING_ISO_FLAG_LONG_FORM       0x0001
 512
 513 /* If set, reset graphic planes and registers at end-of-line to the
 514    initial state.  */
 515 #define CODING_ISO_FLAG_RESET_AT_EOL    0x0002
 516
 517 /* If set, reset graphic planes and registers before any control
 518    characters to the initial state.  */
 519 #define CODING_ISO_FLAG_RESET_AT_CNTL   0x0004
 520
 521 /* If set, encode by 7-bit environment.  */
 522 #define CODING_ISO_FLAG_SEVEN_BITS      0x0008
 523
 524 /* If set, use locking-shift function.  */
 525 #define CODING_ISO_FLAG_LOCKING_SHIFT   0x0010
 526
 527 /* If set, use single-shift function.  Overwrite
 528    CODING_ISO_FLAG_LOCKING_SHIFT.  */
 529 #define CODING_ISO_FLAG_SINGLE_SHIFT    0x0020
 530
 531 /* If set, use designation escape sequence.  */
 532 #define CODING_ISO_FLAG_DESIGNATION     0x0040
 533
 534 /* If set, produce revision number sequence.  */
 535 #define CODING_ISO_FLAG_REVISION        0x0080
 536
 537 /* If set, produce ISO6429's direction specifying sequence.  */
 538 #define CODING_ISO_FLAG_DIRECTION       0x0100
 539
 540 /* If set, assume designation states are reset at beginning of line on
 541    output.  */
 542 #define CODING_ISO_FLAG_INIT_AT_BOL     0x0200
 543
 544 /* If set, designation sequence should be placed at beginning of line
 545    on output.  */
 546 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
 547
 548 /* If set, do not encode unsafe charactes on output.  */
 549 #define CODING_ISO_FLAG_SAFE            0x0800
 550
 551 /* If set, extra latin codes (128..159) are accepted as a valid code
 552    on input.  */
 553 #define CODING_ISO_FLAG_LATIN_EXTRA     0x1000
 554
 555 #define CODING_ISO_FLAG_COMPOSITION     0x2000
 556
 557 #define CODING_ISO_FLAG_EUC_TW_SHIFT    0x4000
 558
 559 #define CODING_ISO_FLAG_USE_ROMAN       0x8000
 560
 561 #define CODING_ISO_FLAG_USE_OLDJIS      0x10000
 562
 563 #define CODING_ISO_FLAG_FULL_SUPPORT    0x100000
 564
 565 /* A character to be produced on output if encoding of the original
 566    character is prohibited by CODING_ISO_FLAG_SAFE.  */
 567 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION  '?'
 568
 569
 570 /* UTF-16 section */
 571 #define CODING_UTF_16_BOM(coding)       \
 572   ((coding)->spec.utf_16.bom)
 573
 574 #define CODING_UTF_16_ENDIAN(coding)    \
 575   ((coding)->spec.utf_16.endian)
 576
 577 #define CODING_UTF_16_SURROGATE(coding) \
 578   ((coding)->spec.utf_16.surrogate)
 579
 580
 581 /* CCL section */
 582 #define CODING_CCL_DECODER(coding)      \
 583   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
 584 #define CODING_CCL_ENCODER(coding)      \
 585   AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
 586 #define CODING_CCL_VALIDS(coding)                                          \
 587   (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
 588    ->data)
 589
 590 /* Index for each coding category in `coding_category_table' */
 591
 592 enum coding_category
 593   {
 594     coding_category_iso_7,
 595     coding_category_iso_7_tight,
 596     coding_category_iso_8_1,
 597     coding_category_iso_8_2,
 598     coding_category_iso_7_else,
 599     coding_category_iso_8_else,
 600     coding_category_utf_8,
 601     coding_category_utf_16_auto,
 602     coding_category_utf_16_be,
 603     coding_category_utf_16_le,
 604     coding_category_utf_16_be_nosig,
 605     coding_category_utf_16_le_nosig,
 606     coding_category_charset,
 607     coding_category_sjis,
 608     coding_category_big5,
 609     coding_category_ccl,
 610     coding_category_emacs_mule,
 611     /* All above are targets of code detection.  */
 612     coding_category_raw_text,
 613     coding_category_undecided,
 614     coding_category_max
 615   };
 616
 617 /* Definitions of flag bits used in detect_coding_XXXX.  */
 618 #define CATEGORY_MASK_ISO_7             (1 << coding_category_iso_7)
 619 #define CATEGORY_MASK_ISO_7_TIGHT       (1 << coding_category_iso_7_tight)
 620 #define CATEGORY_MASK_ISO_8_1           (1 << coding_category_iso_8_1)
 621 #define CATEGORY_MASK_ISO_8_2           (1 << coding_category_iso_8_2)
 622 #define CATEGORY_MASK_ISO_7_ELSE        (1 << coding_category_iso_7_else)
 623 #define CATEGORY_MASK_ISO_8_ELSE        (1 << coding_category_iso_8_else)
 624 #define CATEGORY_MASK_UTF_8             (1 << coding_category_utf_8)
 625 #define CATEGORY_MASK_UTF_16_BE         (1 << coding_category_utf_16_be)
 626 #define CATEGORY_MASK_UTF_16_LE         (1 << coding_category_utf_16_le)
 627 #define CATEGORY_MASK_UTF_16_BE_NOSIG   (1 << coding_category_utf_16_be_nosig)
 628 #define CATEGORY_MASK_UTF_16_LE_NOSIG   (1 << coding_category_utf_16_le_nosig)
 629 #define CATEGORY_MASK_CHARSET           (1 << coding_category_charset)
 630 #define CATEGORY_MASK_SJIS              (1 << coding_category_sjis)
 631 #define CATEGORY_MASK_BIG5              (1 << coding_category_big5)
 632 #define CATEGORY_MASK_CCL               (1 << coding_category_ccl)
 633 #define CATEGORY_MASK_EMACS_MULE        (1 << coding_category_emacs_mule)
 634
 635 /* This value is returned if detect_coding_mask () find nothing other
 636    than ASCII characters.  */
 637 #define CATEGORY_MASK_ANY               \
 638   (CATEGORY_MASK_ISO_7                  \
 639    | CATEGORY_MASK_ISO_7_TIGHT          \
 640    | CATEGORY_MASK_ISO_8_1              \
 641    | CATEGORY_MASK_ISO_8_2              \
 642    | CATEGORY_MASK_ISO_7_ELSE           \
 643    | CATEGORY_MASK_ISO_8_ELSE           \
 644    | CATEGORY_MASK_UTF_8                \
 645    | CATEGORY_MASK_UTF_16_BE            \
 646    | CATEGORY_MASK_UTF_16_LE            \
 647    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 648    | CATEGORY_MASK_UTF_16_LE_NOSIG      \
 649    | CATEGORY_MASK_CHARSET              \
 650    | CATEGORY_MASK_SJIS                 \
 651    | CATEGORY_MASK_BIG5                 \
 652    | CATEGORY_MASK_CCL                  \
 653    | CATEGORY_MASK_EMACS_MULE)
 654
 655
 656 #define CATEGORY_MASK_ISO_7BIT \
 657   (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
 658
 659 #define CATEGORY_MASK_ISO_8BIT \
 660   (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
 661
 662 #define CATEGORY_MASK_ISO_ELSE \
 663   (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
 664
 665 #define CATEGORY_MASK_ISO_ESCAPE        \
 666   (CATEGORY_MASK_ISO_7                  \
 667    | CATEGORY_MASK_ISO_7_TIGHT          \
 668    | CATEGORY_MASK_ISO_7_ELSE           \
 669    | CATEGORY_MASK_ISO_8_ELSE)
 670
 671 #define CATEGORY_MASK_ISO       \
 672   (  CATEGORY_MASK_ISO_7BIT     \
 673      | CATEGORY_MASK_ISO_8BIT   \
 674      | CATEGORY_MASK_ISO_ELSE)
 675
 676 #define CATEGORY_MASK_UTF_16            \
 677   (CATEGORY_MASK_UTF_16_BE              \
 678    | CATEGORY_MASK_UTF_16_LE            \
 679    | CATEGORY_MASK_UTF_16_BE_NOSIG      \
 680    | CATEGORY_MASK_UTF_16_LE_NOSIG)
 681
 682
 683 /* List of symbols `coding-category-xxx' ordered by priority.  This
 684    variable is exposed to Emacs Lisp.  */
 685 static Lisp_Object Vcoding_category_list;
 686
 687 /* Table of coding categories (Lisp symbols).  This variable is for
 688    internal use oly.  */
 689 static Lisp_Object Vcoding_category_table;
 690
 691 /* Table of coding-categories ordered by priority.  */
 692 static enum coding_category coding_priorities[coding_category_max];
 693
 694 /* Nth element is a coding context for the coding system bound to the
 695    Nth coding category.  */
 696 static struct coding_system coding_categories[coding_category_max];
 697
 698 static int detected_mask[coding_category_raw_text] =
 699   { CATEGORY_MASK_ISO,
 700     CATEGORY_MASK_ISO,
 701     CATEGORY_MASK_ISO,
 702     CATEGORY_MASK_ISO,
 703     CATEGORY_MASK_ISO,
 704     CATEGORY_MASK_ISO,
 705     CATEGORY_MASK_UTF_8,
 706     CATEGORY_MASK_UTF_16,
 707     CATEGORY_MASK_UTF_16,
 708     CATEGORY_MASK_UTF_16,
 709     CATEGORY_MASK_UTF_16,
 710     CATEGORY_MASK_UTF_16,
 711     CATEGORY_MASK_CHARSET,
 712     CATEGORY_MASK_SJIS,
 713     CATEGORY_MASK_BIG5,
 714     CATEGORY_MASK_CCL,
 715     CATEGORY_MASK_EMACS_MULE
 716   };
 717
 718 /*** Commonly used macros and functions ***/
 719
 720 #ifndef min
 721 #define min(a, b) ((a) < (b) ? (a) : (b))
 722 #endif
 723 #ifndef max
 724 #define max(a, b) ((a) > (b) ? (a) : (b))
 725 #endif
 726
 727 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list)  \
 728   do {                                                          \
 729     attrs = CODING_ID_ATTRS (coding->id);                       \
 730     eol_type = CODING_ID_EOL_TYPE (coding->id);                 \
 731     if (VECTORP (eol_type))                                     \
 732       eol_type = Qunix;                                         \
 733     charset_list = CODING_ATTR_CHARSET_LIST (attrs);            \
 734   } while (0)
 735
 736
 737 /* Safely get one byte from the source text pointed by SRC which ends
 738    at SRC_END, and set C to that byte.  If there are not enough bytes
 739    in the source, it jumps to `no_more_source'.  The caller
 740    should declare and set these variables appropriately in advance:
 741         src, src_end, multibytep
 742 */
 743
 744 #define ONE_MORE_BYTE(c)                                        \
 745   do {                                                          \
 746     if (src == src_end)                                         \
 747       {                                                         \
 748         if (src_base < src)                                     \
 749           coding->result = CODING_RESULT_INSUFFICIENT_SRC;      \
 750         goto no_more_source;                                    \
 751       }                                                         \
 752     c = *src++;                                                 \
 753     if (multibytep && (c & 0x80))                               \
 754       {                                                         \
 755         if ((c & 0xFE) != 0xC0)                                 \
 756           error ("Undecodable char found");                     \
 757         c = ((c & 1) << 6) | *src++;                            \
 758       }                                                         \
 759     consumed_chars++;                                           \
 760   } while (0)
 761
 762
 763 #define ONE_MORE_BYTE_NO_CHECK(c)               \
 764   do {                                          \
 765     c = *src++;                                 \
 766     if (multibytep && (c & 0x80))               \
 767       {                                         \
 768         if ((c & 0xFE) != 0xC0)                 \
 769           error ("Undecodable char found");     \
 770         c = ((c & 1) << 6) | *src++;            \
 771       }                                         \
 772     consumed_chars++;                           \
 773   } while (0)
 774
 775
 776 /* Store a byte C in the place pointed by DST and increment DST to the
 777    next free point, and increment PRODUCED_CHARS.  The caller should
 778    assure that C is 0..127, and declare and set the variable `dst'
 779    appropriately in advance.
 780 */
 781
 782
 783 #define EMIT_ONE_ASCII_BYTE(c)  \
 784   do {                          \
 785     produced_chars++;           \
 786     *dst++ = (c);               \
 787   } while (0)
 788
 789
 790 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2.  */
 791
 792 #define EMIT_TWO_ASCII_BYTES(c1, c2)    \
 793   do {                                  \
 794     produced_chars += 2;                \
 795     *dst++ = (c1), *dst++ = (c2);       \
 796   } while (0)
 797
 798
 799 /* Store a byte C in the place pointed by DST and increment DST to the
 800    next free point, and increment PRODUCED_CHARS.  If MULTIBYTEP is
 801    nonzero, store in an appropriate multibyte from.  The caller should
 802    declare and set the variables `dst' and `multibytep' appropriately
 803    in advance.  */
 804
 805 #define EMIT_ONE_BYTE(c)                \
 806   do {                                  \
 807     produced_chars++;                   \
 808     if (multibytep)                     \
 809       {                                 \
 810         int ch = (c);                   \
 811         if (ch >= 0x80)                 \
 812           ch = BYTE8_TO_CHAR (ch);      \
 813         CHAR_STRING_ADVANCE (ch, dst);  \
 814       }                                 \
 815     else                                \
 816       *dst++ = (c);                     \
 817   } while (0)
 818
 819
 820 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2.  */
 821
 822 #define EMIT_TWO_BYTES(c1, c2)          \
 823   do {                                  \
 824     produced_chars += 2;                \
 825     if (multibytep)                     \
 826       {                                 \
 827         int ch;                         \
 828                                         \
 829         ch = (c1);                      \
 830         if (ch >= 0x80)                 \
 831           ch = BYTE8_TO_CHAR (ch);      \
 832         CHAR_STRING_ADVANCE (ch, dst);  \
 833         ch = (c2);                      \
 834         if (ch >= 0x80)                 \
 835           ch = BYTE8_TO_CHAR (ch);      \
 836         CHAR_STRING_ADVANCE (ch, dst);  \
 837       }                                 \
 838     else                                \
 839       {                                 \
 840         *dst++ = (c1);                  \
 841         *dst++ = (c2);                  \
 842       }                                 \
 843   } while (0)
 844
 845
 846 #define EMIT_THREE_BYTES(c1, c2, c3)    \
 847   do {                                  \
 848     EMIT_ONE_BYTE (c1);                 \
 849     EMIT_TWO_BYTES (c2, c3);            \
 850   } while (0)
 851
 852
 853 #define EMIT_FOUR_BYTES(c1, c2, c3, c4)         \
 854   do {                                          \
 855     EMIT_TWO_BYTES (c1, c2);                    \
 856     EMIT_TWO_BYTES (c3, c4);                    \
 857   } while (0)
 858
 859
 860 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
 861   do {                                                                       \
 862     charset_map_loaded = 0;                                                  \
 863     c = DECODE_CHAR (charset, code);                                         \
 864     if (charset_map_loaded)                                                  \
 865       {                                                                      \
 866         unsigned char *orig = coding->source;                                \
 867         EMACS_INT offset;                                                    \
 868                                                                              \
 869         coding_set_source (coding);                                          \
 870         offset = coding->source - orig;                                      \
 871         src += offset;                                                       \
 872         src_base += offset;                                                  \
 873         src_end += offset;                                                   \
 874       }                                                                      \
 875   } while (0)
 876
 877
 878 #define ASSURE_DESTINATION(bytes)                               \
 879   do {                                                          \
 880     if (dst + (bytes) >= dst_end)                               \
 881       {                                                         \
 882         int more_bytes = charbuf_end - charbuf + (bytes);       \
 883                                                                 \
 884         dst = alloc_destination (coding, more_bytes, dst);      \
 885         dst_end = coding->destination + coding->dst_bytes;      \
 886       }                                                         \
 887   } while (0)
 888
 889
 890
 891 static void
 892 coding_set_source (coding)
 893      struct coding_system *coding;
 894 {
 895   if (BUFFERP (coding->src_object))
 896     {
 897       if (coding->src_pos < 0)
 898         coding->source = GAP_END_ADDR + coding->src_pos_byte;
 899       else
 900         {
 901           struct buffer *buf = XBUFFER (coding->src_object);
 902           EMACS_INT gpt_byte = BUF_GPT_BYTE (buf);
 903           unsigned char *beg_addr = BUF_BEG_ADDR (buf);
 904
 905           coding->source = beg_addr + coding->src_pos_byte - 1;
 906           if (coding->src_pos_byte >= gpt_byte)
 907             coding->source += BUF_GAP_SIZE (buf);
 908         }
 909     }
 910   else if (STRINGP (coding->src_object))
 911     {
 912       coding->source = (XSTRING (coding->src_object)->data
 913                         + coding->src_pos_byte);
 914     }
 915   else
 916     /* Otherwise, the source is C string and is never relocated
 917        automatically.  Thus we don't have to update anything.  */
 918     ;
 919 }
 920
 921 static void
 922 coding_set_destination (coding)
 923      struct coding_system *coding;
 924 {
 925   if (BUFFERP (coding->dst_object))
 926     {
 927       /* We are sure that coding->dst_pos_byte is before the gap of the
 928          buffer. */
 929       coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
 930                              + coding->dst_pos_byte - 1);
 931       if (coding->src_pos < 0)
 932         coding->dst_bytes = (GAP_END_ADDR
 933                              - (coding->src_bytes - coding->consumed)
 934                              - coding->destination);
 935       else
 936         coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
 937                              - coding->destination);
 938     }
 939   else
 940     /* Otherwise, the destination is C string and is never relocated
 941        automatically.  Thus we don't have to update anything.  */
 942     ;
 943 }
 944
 945
 946 static void
 947 coding_alloc_by_realloc (coding, bytes)
 948      struct coding_system *coding;
 949      EMACS_INT bytes;
 950 {
 951   coding->destination = (unsigned char *) xrealloc (coding->destination,
 952                                                     coding->dst_bytes + bytes);
 953   coding->dst_bytes += bytes;
 954 }
 955
 956 static void
 957 coding_alloc_by_making_gap (coding, bytes)
 958      struct coding_system *coding;
 959      EMACS_INT bytes;
 960 {
 961   if (BUFFERP (coding->dst_object)
 962       && EQ (coding->src_object, coding->dst_object))
 963     {
 964       EMACS_INT add = coding->src_bytes - coding->consumed;
 965
 966       GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
 967       make_gap (bytes);
 968       GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
 969     }
 970   else
 971     {
 972       Lisp_Object this_buffer;
 973
 974       this_buffer = Fcurrent_buffer ();
 975       set_buffer_internal (XBUFFER (coding->dst_object));
 976       make_gap (bytes);
 977       set_buffer_internal (XBUFFER (this_buffer));
 978     }
 979 }
 980
 981
 982 static unsigned char *
 983 alloc_destination (coding, nbytes, dst)
 984      struct coding_system *coding;
 985      int nbytes;
 986      unsigned char *dst;
 987 {
 988   EMACS_INT offset = dst - coding->destination;
 989
 990   if (BUFFERP (coding->dst_object))
 991     coding_alloc_by_making_gap (coding, nbytes);
 992   else
 993     coding_alloc_by_realloc (coding, nbytes);
 994   coding->result = CODING_RESULT_SUCCESS;
 995   coding_set_destination (coding);
 996   dst = coding->destination + offset;
 997   return dst;
 998 }
 999
1000 \f
1001 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1002
1003
1004
1005 \f
1006 /*** 3. UTF-8 ***/
1007
1008 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1009    Check if a text is encoded in UTF-8.  If it is, return
1010    CATEGORY_MASK_UTF_8, else return 0.  */
1011
1012 #define UTF_8_1_OCTET_P(c)         ((c) < 0x80)
1013 #define UTF_8_EXTRA_OCTET_P(c)     (((c) & 0xC0) == 0x80)
1014 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1015 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1016 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1017 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1018
1019 static int
1020 detect_coding_utf_8 (coding, mask)
1021      struct coding_system *coding;
1022      int *mask;
1023 {
1024   unsigned char *src = coding->source, *src_base = src;
1025   unsigned char *src_end = coding->source + coding->src_bytes;
1026   int multibytep = coding->src_multibyte;
1027   int consumed_chars = 0;
1028   int found = 0;
1029
1030   /* A coding system of this category is always ASCII compatible.  */
1031   src += coding->head_ascii;
1032
1033   while (1)
1034     {
1035       int c, c1, c2, c3, c4;
1036
1037       ONE_MORE_BYTE (c);
1038       if (UTF_8_1_OCTET_P (c))
1039         continue;
1040       ONE_MORE_BYTE (c1);
1041       if (! UTF_8_EXTRA_OCTET_P (c1))
1042         break;
1043       if (UTF_8_2_OCTET_LEADING_P (c))
1044         {
1045           found++;
1046           continue;
1047         }
1048       ONE_MORE_BYTE (c2);
1049       if (! UTF_8_EXTRA_OCTET_P (c2))
1050         break;
1051       if (UTF_8_3_OCTET_LEADING_P (c))
1052         {
1053           found++;
1054           continue;
1055         }
1056       ONE_MORE_BYTE (c3);
1057       if (! UTF_8_EXTRA_OCTET_P (c3))
1058         break;
1059       if (UTF_8_4_OCTET_LEADING_P (c))
1060         {
1061           found++;
1062           continue;
1063         }
1064       ONE_MORE_BYTE (c4);
1065       if (! UTF_8_EXTRA_OCTET_P (c4))
1066         break;
1067       if (UTF_8_5_OCTET_LEADING_P (c))
1068         {
1069           found++;
1070           continue;
1071         }
1072       break;
1073     }
1074   *mask &= ~CATEGORY_MASK_UTF_8;
1075   return 0;
1076
1077  no_more_source:
1078   if (! found)
1079     return 0;
1080   *mask &= CATEGORY_MASK_UTF_8;
1081   return 1;
1082 }
1083
1084
1085 /* Fixme: deal with surrogates?  */
1086 static void
1087 decode_coding_utf_8 (coding)
1088      struct coding_system *coding;
1089 {
1090   unsigned char *src = coding->source + coding->consumed;
1091   unsigned char *src_end = coding->source + coding->src_bytes;
1092   unsigned char *src_base;
1093   int *charbuf = coding->charbuf;
1094   int *charbuf_end = charbuf + coding->charbuf_size;
1095   int consumed_chars = 0, consumed_chars_base;
1096   int multibytep = coding->src_multibyte;
1097   Lisp_Object attr, eol_type, charset_list;
1098
1099   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1100
1101   while (1)
1102     {
1103       int c, c1, c2, c3, c4, c5;
1104
1105       src_base = src;
1106       consumed_chars_base = consumed_chars;
1107
1108       if (charbuf >= charbuf_end)
1109         break;
1110
1111       ONE_MORE_BYTE (c1);
1112       if (UTF_8_1_OCTET_P(c1))
1113         {
1114           c = c1;
1115           if (c == '\r')
1116             {
1117               if (EQ (eol_type, Qdos))
1118                 {
1119                   if (src == src_end)
1120                     goto no_more_source;
1121                   if (*src == '\n')
1122                     ONE_MORE_BYTE (c);
1123                 }
1124               else if (EQ (eol_type, Qmac))
1125                 c = '\n';
1126             }
1127         }
1128       else
1129         {
1130           ONE_MORE_BYTE (c2);
1131           if (! UTF_8_EXTRA_OCTET_P (c2))
1132             goto invalid_code;
1133           if (UTF_8_2_OCTET_LEADING_P (c1))
1134             {
1135               c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1136               /* Reject overlong sequences here and below.  Encoders
1137                  producing them are incorrect, they can be misleading,
1138                  and they mess up read/write invariance.  */
1139               if (c < 128)
1140                 goto invalid_code;
1141             }
1142           else
1143             {
1144               ONE_MORE_BYTE (c3);
1145               if (! UTF_8_EXTRA_OCTET_P (c3))
1146                 goto invalid_code;
1147               if (UTF_8_3_OCTET_LEADING_P (c1))
1148                 {
1149                   c = (((c1 & 0xF) << 12)
1150                        | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1151                   if (c < 0x800)
1152                     goto invalid_code;
1153                 }
1154               else
1155                 {
1156                   ONE_MORE_BYTE (c4);
1157                   if (! UTF_8_EXTRA_OCTET_P (c4))
1158                     goto invalid_code;
1159                   if (UTF_8_4_OCTET_LEADING_P (c1))
1160                     {
1161                     c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1162                          | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1163                     if (c < 0x10000)
1164                       goto invalid_code;
1165                     }
1166                   else
1167                     {
1168                       ONE_MORE_BYTE (c5);
1169                       if (! UTF_8_EXTRA_OCTET_P (c5))
1170                         goto invalid_code;
1171                       if (UTF_8_5_OCTET_LEADING_P (c1))
1172                         {
1173                           c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1174                                | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1175                                | (c5 & 0x3F));
1176                           if ((c > MAX_CHAR) || (c < 0x200000))
1177                             goto invalid_code;
1178                         }
1179                       else
1180                         goto invalid_code;
1181                     }
1182                 }
1183             }
1184         }
1185
1186       *charbuf++ = c;
1187       continue;
1188
1189     invalid_code:
1190       src = src_base;
1191       consumed_chars = consumed_chars_base;
1192       ONE_MORE_BYTE (c);
1193       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1194       coding->errors++;
1195     }
1196
1197  no_more_source:
1198   coding->consumed_char += consumed_chars_base;
1199   coding->consumed = src_base - coding->source;
1200   coding->charbuf_used = charbuf - coding->charbuf;
1201 }
1202
1203
1204 static int
1205 encode_coding_utf_8 (coding)
1206      struct coding_system *coding;
1207 {
1208   int multibytep = coding->dst_multibyte;
1209   int *charbuf = coding->charbuf;
1210   int *charbuf_end = charbuf + coding->charbuf_used;
1211   unsigned char *dst = coding->destination + coding->produced;
1212   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1213   int produced_chars = 0;
1214   int c;
1215
1216   if (multibytep)
1217     {
1218       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1219
1220       while (charbuf < charbuf_end)
1221         {
1222           unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1223
1224           ASSURE_DESTINATION (safe_room);
1225           c = *charbuf++;
1226           CHAR_STRING_ADVANCE (c, pend);
1227           for (p = str; p < pend; p++)
1228             EMIT_ONE_BYTE (*p);
1229         }
1230     }
1231   else
1232     {
1233       int safe_room = MAX_MULTIBYTE_LENGTH;
1234
1235       while (charbuf < charbuf_end)
1236         {
1237           ASSURE_DESTINATION (safe_room);
1238           c = *charbuf++;
1239           dst += CHAR_STRING (c, dst);
1240           produced_chars++;
1241         }
1242     }
1243   coding->result = CODING_RESULT_SUCCESS;
1244   coding->produced_char += produced_chars;
1245   coding->produced = dst - coding->destination;
1246   return 0;
1247 }
1248
1249
1250 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1251    Check if a text is encoded in UTF-16 Big Endian (endian == 1) or
1252    Little Endian (otherwise).  If it is, return
1253    CATEGORY_MASK_UTF_16_BE or CATEGORY_MASK_UTF_16_LE,
1254    else return 0.  */
1255
1256 #define UTF_16_HIGH_SURROGATE_P(val) \
1257   (((val) & 0xFC00) == 0xD800)
1258
1259 #define UTF_16_LOW_SURROGATE_P(val) \
1260   (((val) & 0xFC00) == 0xDC00)
1261
1262 #define UTF_16_INVALID_P(val)   \
1263   (((val) == 0xFFFE)            \
1264    || ((val) == 0xFFFF)         \
1265    || UTF_16_LOW_SURROGATE_P (val))
1266
1267
1268 static int
1269 detect_coding_utf_16 (coding, mask)
1270      struct coding_system *coding;
1271      int *mask;
1272 {
1273   unsigned char *src = coding->source, *src_base = src;
1274   unsigned char *src_end = coding->source + coding->src_bytes;
1275   int multibytep = coding->src_multibyte;
1276   int consumed_chars = 0;
1277   int c1, c2;
1278
1279   ONE_MORE_BYTE (c1);
1280   ONE_MORE_BYTE (c2);
1281
1282   if ((c1 == 0xFF) && (c2 == 0xFE))
1283     {
1284       *mask &= CATEGORY_MASK_UTF_16_LE;
1285       return 1;
1286     }
1287   else if ((c1 == 0xFE) && (c2 == 0xFF))
1288     {
1289       *mask &= CATEGORY_MASK_UTF_16_BE;
1290       return 1;
1291     }
1292  no_more_source:
1293   return 0;
1294 }
1295
1296 static void
1297 decode_coding_utf_16 (coding)
1298      struct coding_system *coding;
1299 {
1300   unsigned char *src = coding->source + coding->consumed;
1301   unsigned char *src_end = coding->source + coding->src_bytes;
1302   unsigned char *src_base;
1303   int *charbuf = coding->charbuf;
1304   int *charbuf_end = charbuf + coding->charbuf_size;
1305   int consumed_chars = 0, consumed_chars_base;
1306   int multibytep = coding->src_multibyte;
1307   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1308   enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1309   int surrogate = CODING_UTF_16_SURROGATE (coding);
1310   Lisp_Object attr, eol_type, charset_list;
1311
1312   CODING_GET_INFO (coding, attr, eol_type, charset_list);
1313
1314   if (bom != utf_16_without_bom)
1315     {
1316       int c, c1, c2;
1317
1318       src_base = src;
1319       ONE_MORE_BYTE (c1);
1320       ONE_MORE_BYTE (c2);
1321       c = (c1 << 8) | c2;
1322       if (bom == utf_16_with_bom)
1323         {
1324           if (endian == utf_16_big_endian
1325               ? c != 0xFFFE : c != 0xFEFF)
1326             {
1327               /* We are sure that there's enouph room at CHARBUF.  */
1328               *charbuf++ = c1;
1329               *charbuf++ = c2;
1330               coding->errors++;
1331             }
1332         }
1333       else
1334         {
1335           if (c == 0xFFFE)
1336             CODING_UTF_16_ENDIAN (coding)
1337               = endian = utf_16_big_endian;
1338           else if (c == 0xFEFF)
1339             CODING_UTF_16_ENDIAN (coding)
1340               = endian = utf_16_little_endian;
1341           else
1342             {
1343               CODING_UTF_16_ENDIAN (coding)
1344                 = endian = utf_16_big_endian;
1345               src = src_base;
1346             }
1347         }
1348       CODING_UTF_16_BOM (coding) = utf_16_with_bom;
1349     }
1350
1351   while (1)
1352     {
1353       int c, c1, c2;
1354
1355       src_base = src;
1356       consumed_chars_base = consumed_chars;
1357
1358       if (charbuf + 2 >= charbuf_end)
1359         break;
1360
1361       ONE_MORE_BYTE (c1);
1362       ONE_MORE_BYTE (c2);
1363       c = (endian == utf_16_big_endian
1364            ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1365       if (surrogate)
1366         {
1367           if (! UTF_16_LOW_SURROGATE_P (c))
1368             {
1369               if (endian == utf_16_big_endian)
1370                 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1371               else
1372                 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1373               *charbuf++ = c1;
1374               *charbuf++ = c2;
1375               coding->errors++;
1376               if (UTF_16_HIGH_SURROGATE_P (c))
1377                 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1378               else
1379                 *charbuf++ = c;
1380             }
1381           else
1382             {
1383               c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1384               CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1385               *charbuf++ = c;
1386             }
1387         }
1388       else
1389         {
1390           if (UTF_16_HIGH_SURROGATE_P (c))
1391             CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1392           else
1393             *charbuf++ = c;
1394         }
1395     }
1396
1397  no_more_source:
1398   coding->consumed_char += consumed_chars_base;
1399   coding->consumed = src_base - coding->source;
1400   coding->charbuf_used = charbuf - coding->charbuf;
1401 }
1402
1403 static int
1404 encode_coding_utf_16 (coding)
1405      struct coding_system *coding;
1406 {
1407   int multibytep = coding->dst_multibyte;
1408   int *charbuf = coding->charbuf;
1409   int *charbuf_end = charbuf + coding->charbuf_used;
1410   unsigned char *dst = coding->destination + coding->produced;
1411   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1412   int safe_room = 8;
1413   enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1414   int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1415   int produced_chars = 0;
1416   Lisp_Object attrs, eol_type, charset_list;
1417   int c;
1418
1419   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1420
1421   if (bom == utf_16_with_bom)
1422     {
1423       ASSURE_DESTINATION (safe_room);
1424       if (big_endian)
1425         EMIT_TWO_BYTES (0xFF, 0xFE);
1426       else
1427         EMIT_TWO_BYTES (0xFE, 0xFF);
1428       CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1429     }
1430
1431   while (charbuf < charbuf_end)
1432     {
1433       ASSURE_DESTINATION (safe_room);
1434       c = *charbuf++;
1435       if (c >= MAX_UNICODE_CHAR)
1436         c = coding->default_char;
1437
1438       if (c < 0x10000)
1439         {
1440           if (big_endian)
1441             EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1442           else
1443             EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1444         }
1445       else
1446         {
1447           int c1, c2;
1448
1449           c -= 0x10000;
1450           c1 = (c >> 10) + 0xD800;
1451           c2 = (c & 0x3FF) + 0xDC00;
1452           if (big_endian)
1453             EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1454           else
1455             EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1456         }
1457     }
1458   coding->result = CODING_RESULT_SUCCESS;
1459   coding->produced = dst - coding->destination;
1460   coding->produced_char += produced_chars;
1461   return 0;
1462 }
1463
1464 \f
1465 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1466
1467 /* Emacs' internal format for representation of multiple character
1468    sets is a kind of multi-byte encoding, i.e. characters are
1469    represented by variable-length sequences of one-byte codes.
1470
1471    ASCII characters and control characters (e.g. `tab', `newline') are
1472    represented by one-byte sequences which are their ASCII codes, in
1473    the range 0x00 through 0x7F.
1474
1475    8-bit characters of the range 0x80..0x9F are represented by
1476    two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1477    code + 0x20).
1478
1479    8-bit characters of the range 0xA0..0xFF are represented by
1480    one-byte sequences which are their 8-bit code.
1481
1482    The other characters are represented by a sequence of `base
1483    leading-code', optional `extended leading-code', and one or two
1484    `position-code's.  The length of the sequence is determined by the
1485    base leading-code.  Leading-code takes the range 0x81 through 0x9D,
1486    whereas extended leading-code and position-code take the range 0xA0
1487    through 0xFF.  See `charset.h' for more details about leading-code
1488    and position-code.
1489
1490    --- CODE RANGE of Emacs' internal format ---
1491    character set        range
1492    -------------        -----
1493    ascii                0x00..0x7F
1494    eight-bit-control    LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1495    eight-bit-graphic    0xA0..0xBF
1496    ELSE                 0x81..0x9D + [0xA0..0xFF]+
1497    ---------------------------------------------
1498
1499    As this is the internal character representation, the format is
1500    usually not used externally (i.e. in a file or in a data sent to a
1501    process).  But, it is possible to have a text externally in this
1502    format (i.e. by encoding by the coding system `emacs-mule').
1503
1504    In that case, a sequence of one-byte codes has a slightly different
1505    form.
1506
1507    At first, all characters in eight-bit-control are represented by
1508    one-byte sequences which are their 8-bit code.
1509
1510    Next, character composition data are represented by the byte
1511    sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1512    where,
1513         METHOD is 0xF0 plus one of composition method (enum
1514         composition_method),
1515
1516         BYTES is 0xA0 plus a byte length of this composition data,
1517
1518         CHARS is 0x20 plus a number of characters composed by this
1519         data,
1520
1521         COMPONENTs are characters of multibye form or composition
1522         rules encoded by two-byte of ASCII codes.
1523
1524    In addition, for backward compatibility, the following formats are
1525    also recognized as composition data on decoding.
1526
1527    0x80 MSEQ ...
1528    0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1529
1530    Here,
1531         MSEQ is a multibyte form but in these special format:
1532           ASCII: 0xA0 ASCII_CODE+0x80,
1533           other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1534         RULE is a one byte code of the range 0xA0..0xF0 that
1535         represents a composition rule.
1536   */
1537
1538 char emacs_mule_bytes[256];
1539
1540 /* Leading-code followed by extended leading-code.  */
1541 #define LEADING_CODE_PRIVATE_11 0x9A /* for private DIMENSION1 of 1-column */
1542 #define LEADING_CODE_PRIVATE_12 0x9B /* for private DIMENSION1 of 2-column */
1543 #define LEADING_CODE_PRIVATE_21 0x9C /* for private DIMENSION2 of 1-column */
1544 #define LEADING_CODE_PRIVATE_22 0x9D /* for private DIMENSION2 of 2-column */
1545
1546
1547 int
1548 emacs_mule_char (coding, src, nbytes, nchars)
1549      struct coding_system *coding;
1550      unsigned char *src;
1551      int *nbytes, *nchars;
1552 {
1553   unsigned char *src_end = coding->source + coding->src_bytes;
1554   int multibytep = coding->src_multibyte;
1555   unsigned char *src_base = src;
1556   struct charset *charset;
1557   unsigned code;
1558   int c;
1559   int consumed_chars = 0;
1560
1561   ONE_MORE_BYTE (c);
1562   switch (emacs_mule_bytes[c])
1563     {
1564     case 2:
1565       if (! (charset = emacs_mule_charset[c]))
1566         goto invalid_code;
1567       ONE_MORE_BYTE (c);
1568       code = c & 0x7F;
1569       break;
1570
1571     case 3:
1572       if (c == LEADING_CODE_PRIVATE_11
1573           || c == LEADING_CODE_PRIVATE_12)
1574         {
1575           ONE_MORE_BYTE (c);
1576           if (! (charset = emacs_mule_charset[c]))
1577             goto invalid_code;
1578           ONE_MORE_BYTE (c);
1579           code = c & 0x7F;
1580         }
1581       else
1582         {
1583           if (! (charset = emacs_mule_charset[c]))
1584             goto invalid_code;
1585           ONE_MORE_BYTE (c);
1586           code = (c & 0x7F) << 8;
1587           ONE_MORE_BYTE (c);
1588           code |= c & 0x7F;
1589         }
1590       break;
1591
1592     case 4:
1593       ONE_MORE_BYTE (c);
1594       if (! (charset = emacs_mule_charset[c]))
1595         goto invalid_code;
1596       ONE_MORE_BYTE (c);
1597       code = (c & 0x7F) << 8;
1598       ONE_MORE_BYTE (c);
1599       code |= c & 0x7F;
1600       break;
1601
1602     case 1:
1603       code = c;
1604       charset = CHARSET_FROM_ID (ASCII_BYTE_P (code) ? charset_ascii
1605                                  : code < 0xA0 ? charset_8_bit_control
1606                                  : charset_8_bit_graphic);
1607       break;
1608
1609     default:
1610       abort ();
1611     }
1612   c = DECODE_CHAR (charset, code);
1613   if (c < 0)
1614     goto invalid_code;
1615   *nbytes = src - src_base;
1616   *nchars = consumed_chars;
1617   return c;
1618
1619  no_more_source:
1620   return -2;
1621
1622  invalid_code:
1623   return -1;
1624 }
1625
1626
1627 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1628    Check if a text is encoded in `emacs-mule'.  */
1629
1630 static int
1631 detect_coding_emacs_mule (coding, mask)
1632      struct coding_system *coding;
1633      int *mask;
1634 {
1635   unsigned char *src = coding->source, *src_base = src;
1636   unsigned char *src_end = coding->source + coding->src_bytes;
1637   int multibytep = coding->src_multibyte;
1638   int consumed_chars = 0;
1639   int c;
1640   int found = 0;
1641
1642   /* A coding system of this category is always ASCII compatible.  */
1643   src += coding->head_ascii;
1644
1645   while (1)
1646     {
1647       ONE_MORE_BYTE (c);
1648
1649       if (c == 0x80)
1650         {
1651           /* Perhaps the start of composite character.  We simple skip
1652              it because analyzing it is too heavy for detecting.  But,
1653              at least, we check that the composite character
1654              constitues of more than 4 bytes.  */
1655           unsigned char *src_base;
1656
1657         repeat:
1658           src_base = src;
1659           do
1660             {
1661               ONE_MORE_BYTE (c);
1662             }
1663           while (c >= 0xA0);
1664
1665           if (src - src_base <= 4)
1666             break;
1667           found = 1;
1668           if (c == 0x80)
1669             goto repeat;
1670         }
1671
1672       if (c < 0x80)
1673         {
1674           if (c < 0x20
1675               && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1676             break;
1677         }
1678       else
1679         {
1680           unsigned char *src_base = src - 1;
1681
1682           do
1683             {
1684               ONE_MORE_BYTE (c);
1685             }
1686           while (c >= 0xA0);
1687           if (src - src_base != emacs_mule_bytes[*src_base])
1688             break;
1689           found = 1;
1690         }
1691     }
1692   *mask &= ~CATEGORY_MASK_EMACS_MULE;
1693   return 0;
1694
1695  no_more_source:
1696   if (!found)
1697     return 0;
1698   *mask &= CATEGORY_MASK_EMACS_MULE;
1699   return 1;
1700 }
1701
1702
1703 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
1704
1705 /* Decode a character represented as a component of composition
1706    sequence of Emacs 20/21 style at SRC.  Set C to that character and
1707    update SRC to the head of next character (or an encoded composition
1708    rule).  If SRC doesn't points a composition component, set C to -1.
1709    If SRC points an invalid byte sequence, global exit by a return
1710    value 0.  */
1711
1712 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf)                 \
1713   if (1)                                                        \
1714     {                                                           \
1715       int c;                                                    \
1716       int nbytes, nchars;                                       \
1717                                                                 \
1718       if (src == src_end)                                       \
1719         break;                                                  \
1720       c = emacs_mule_char (coding, src, &nbytes, &nchars);      \
1721       if (c < 0)                                                \
1722         {                                                       \
1723           if (c == -2)                                          \
1724             break;                                              \
1725           goto invalid_code;                                    \
1726         }                                                       \
1727       *buf++ = c;                                               \
1728       src += nbytes;                                            \
1729       consumed_chars += nchars;                                 \
1730     }                                                           \
1731   else
1732
1733
1734 /* Decode a composition rule represented as a component of composition
1735    sequence of Emacs 20 style at SRC.  Store the decoded rule in *BUF,
1736    and increment BUF.  If SRC points an invalid byte sequence, set C
1737    to -1.  */
1738
1739 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf)      \
1740   do {                                                  \
1741     int c, gref, nref;                                  \
1742                                                         \
1743     if (src >= src_end)                                 \
1744       goto invalid_code;                                \
1745     ONE_MORE_BYTE_NO_CHECK (c);                         \
1746     c -= 0x20;                                          \
1747     if (c < 0 || c >= 81)                               \
1748       goto invalid_code;                                \
1749                                                         \
1750     gref = c / 9, nref = c % 9;                         \
1751     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1752   } while (0)
1753
1754
1755 /* Decode a composition rule represented as a component of composition
1756    sequence of Emacs 21 style at SRC.  Store the decoded rule in *BUF,
1757    and increment BUF.  If SRC points an invalid byte sequence, set C
1758    to -1.  */
1759
1760 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf)      \
1761   do {                                                  \
1762     int gref, nref;                                     \
1763                                                         \
1764     if (src + 1>= src_end)                              \
1765       goto invalid_code;                                \
1766     ONE_MORE_BYTE_NO_CHECK (gref);                      \
1767     gref -= 0x20;                                       \
1768     ONE_MORE_BYTE_NO_CHECK (nref);                      \
1769     nref -= 0x20;                                       \
1770     if (gref < 0 || gref >= 81                          \
1771         || nref < 0 || nref >= 81)                      \
1772       goto invalid_code;                                \
1773     *buf++ = COMPOSITION_ENCODE_RULE (gref, nref);      \
1774   } while (0)
1775
1776
1777 #define ADD_COMPOSITION_DATA(buf, method, nchars)       \
1778   do {                                                  \
1779     *buf++ = -5;                                        \
1780     *buf++ = coding->produced_char + char_offset;       \
1781     *buf++ = CODING_ANNOTATE_COMPOSITION_MASK;          \
1782     *buf++ = method;                                    \
1783     *buf++ = nchars;                                    \
1784   } while (0)
1785
1786
1787 #define DECODE_EMACS_MULE_21_COMPOSITION(c)                             \
1788   do {                                                                  \
1789     /* Emacs 21 style format.  The first three bytes at SRC are         \
1790        (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is  \
1791        the byte length of this composition information, CHARS is the    \
1792        number of characters composed by this composition.  */           \
1793     enum composition_method method = c - 0xF2;                          \
1794     int *charbuf_base = charbuf;                                        \
1795     int consumed_chars_limit;                                           \
1796     int nbytes, nchars;                                                 \
1797                                                                         \
1798     ONE_MORE_BYTE (c);                                                  \
1799     nbytes = c - 0xA0;                                                  \
1800     if (nbytes < 3)                                                     \
1801       goto invalid_code;                                                \
1802     ONE_MORE_BYTE (c);                                                  \
1803     nchars = c - 0xA0;                                                  \
1804     ADD_COMPOSITION_DATA (charbuf, method, nchars);                     \
1805     consumed_chars_limit = consumed_chars_base + nbytes;                \
1806     if (method != COMPOSITION_RELATIVE)                                 \
1807       {                                                                 \
1808         int i = 0;                                                      \
1809         while (consumed_chars < consumed_chars_limit)                   \
1810           {                                                             \
1811             if (i % 2 && method != COMPOSITION_WITH_ALTCHARS)           \
1812               DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf);          \
1813             else                                                        \
1814               DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf);             \
1815             i++;                                                        \
1816           }                                                             \
1817         if (consumed_chars < consumed_chars_limit)                      \
1818           goto invalid_code;                                            \
1819         charbuf_base[0] -= i;                                           \
1820       }                                                                 \
1821   } while (0)
1822
1823
1824 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c)            \
1825   do {                                                          \
1826     /* Emacs 20 style format for relative composition.  */      \
1827     /* Store multibyte form of characters to be composed.  */   \
1828     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1829     int *buf = components;                                      \
1830     int i, j;                                                   \
1831                                                                 \
1832     src = src_base;                                             \
1833     ONE_MORE_BYTE (c);          /* skip 0x80 */                 \
1834     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1835       DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                 \
1836     if (i < 2)                                                  \
1837       goto invalid_code;                                        \
1838     ADD_COMPOSITION_DATA (charbuf, COMPOSITION_RELATIVE, i);    \
1839     for (j = 0; j < i; j++)                                     \
1840       *charbuf++ = components[j];                               \
1841   } while (0)
1842
1843
1844 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c)            \
1845   do {                                                          \
1846     /* Emacs 20 style format for rule-base composition.  */     \
1847     /* Store multibyte form of characters to be composed.  */   \
1848     int components[MAX_COMPOSITION_COMPONENTS * 2 - 1];         \
1849     int *buf = components;                                      \
1850     int i, j;                                                   \
1851                                                                 \
1852     DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);                   \
1853     for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++)            \
1854       {                                                         \
1855         DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf);            \
1856         DECODE_EMACS_MULE_COMPOSITION_CHAR (buf);               \
1857       }                                                         \
1858     if (i < 1 || (buf - components) % 2 == 0)                   \
1859       goto invalid_code;                                        \
1860     if (charbuf + i + (i / 2) + 1 < charbuf_end)                \
1861       goto no_more_source;                                      \
1862     ADD_COMPOSITION_DATA (buf, COMPOSITION_WITH_RULE, i);       \
1863     for (j = 0; j < i; j++)                                     \
1864       *charbuf++ = components[j];                               \
1865     for (j = 0; j < i; j += 2)                                  \
1866       *charbuf++ = components[j];                               \
1867   } while (0)
1868
1869
1870 static void
1871 decode_coding_emacs_mule (coding)
1872      struct coding_system *coding;
1873 {
1874   unsigned char *src = coding->source + coding->consumed;
1875   unsigned char *src_end = coding->source + coding->src_bytes;
1876   unsigned char *src_base;
1877   int *charbuf = coding->charbuf;
1878   int *charbuf_end = charbuf + coding->charbuf_size;
1879   int consumed_chars = 0, consumed_chars_base;
1880   int char_offset = 0;
1881   int multibytep = coding->src_multibyte;
1882   Lisp_Object attrs, eol_type, charset_list;
1883
1884   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1885
1886   while (1)
1887     {
1888       int c;
1889
1890       src_base = src;
1891       consumed_chars_base = consumed_chars;
1892
1893       if (charbuf >= charbuf_end)
1894         break;
1895
1896       ONE_MORE_BYTE (c);
1897
1898       if (c < 0x80)
1899         {
1900           if (c == '\r')
1901             {
1902               if (EQ (eol_type, Qdos))
1903                 {
1904                   if (src == src_end)
1905                     goto no_more_source;
1906                   if (*src == '\n')
1907                     ONE_MORE_BYTE (c);
1908                 }
1909               else if (EQ (eol_type, Qmac))
1910                 c = '\n';
1911             }
1912           *charbuf++ = c;
1913           char_offset++;
1914         }
1915       else if (c == 0x80)
1916         {
1917           if (charbuf + 5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 > charbuf_end)
1918             break;
1919           ONE_MORE_BYTE (c);
1920           if (c - 0xF2 >= COMPOSITION_RELATIVE
1921               && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1922             DECODE_EMACS_MULE_21_COMPOSITION (c);
1923           else if (c < 0xC0)
1924             DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1925           else if (c == 0xFF)
1926             DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1927           else
1928             goto invalid_code;
1929           coding->annotated = 1;
1930         }
1931       else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1932         {
1933           int nbytes, nchars;
1934           src = src_base;
1935           consumed_chars = consumed_chars_base;
1936           c = emacs_mule_char (coding, src, &nbytes, &nchars);
1937           if (c < 0)
1938             {
1939               if (c == -2)
1940                 break;
1941               goto invalid_code;
1942             }
1943           *charbuf++ = c;
1944           src += nbytes;
1945           consumed_chars += nchars;
1946           char_offset++;
1947         }
1948       continue;
1949
1950     invalid_code:
1951       src = src_base;
1952       consumed_chars = consumed_chars_base;
1953       ONE_MORE_BYTE (c);
1954       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1955       coding->errors++;
1956     }
1957
1958  no_more_source:
1959   coding->consumed_char += consumed_chars_base;
1960   coding->consumed = src_base - coding->source;
1961   coding->charbuf_used = charbuf - coding->charbuf;
1962 }
1963
1964
1965 #define EMACS_MULE_LEADING_CODES(id, codes)     \
1966   do {                                          \
1967     if (id < 0xA0)                              \
1968       codes[0] = id, codes[1] = 0;              \
1969     else if (id < 0xE0)                         \
1970       codes[0] = 0x9A, codes[1] = id;           \
1971     else if (id < 0xF0)                         \
1972       codes[0] = 0x9B, codes[1] = id;           \
1973     else if (id < 0xF5)                         \
1974       codes[0] = 0x9C, codes[1] = id;           \
1975     else                                        \
1976       codes[0] = 0x9D, codes[1] = id;           \
1977   } while (0);
1978
1979
1980 static int
1981 encode_coding_emacs_mule (coding)
1982      struct coding_system *coding;
1983 {
1984   int multibytep = coding->dst_multibyte;
1985   int *charbuf = coding->charbuf;
1986   int *charbuf_end = charbuf + coding->charbuf_used;
1987   unsigned char *dst = coding->destination + coding->produced;
1988   unsigned char *dst_end = coding->destination + coding->dst_bytes;
1989   int safe_room = 8;
1990   int produced_chars = 0;
1991   Lisp_Object attrs, eol_type, charset_list;
1992   int c;
1993
1994   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1995
1996   while (charbuf < charbuf_end)
1997     {
1998       ASSURE_DESTINATION (safe_room);
1999       c = *charbuf++;
2000       if (ASCII_CHAR_P (c))
2001         EMIT_ONE_ASCII_BYTE (c);
2002       else if (CHAR_BYTE8_P (c))
2003         {
2004           c = CHAR_TO_BYTE8 (c);
2005           EMIT_ONE_BYTE (c);
2006         }
2007       else
2008         {
2009           struct charset *charset;
2010           unsigned code;
2011           int dimension;
2012           int emacs_mule_id;
2013           unsigned char leading_codes[2];
2014
2015           charset = char_charset (c, charset_list, &code);
2016           if (! charset)
2017             {
2018               c = coding->default_char;
2019               if (ASCII_CHAR_P (c))
2020                 {
2021                   EMIT_ONE_ASCII_BYTE (c);
2022                   continue;
2023                 }
2024               charset = char_charset (c, charset_list, &code);
2025             }
2026           dimension = CHARSET_DIMENSION (charset);
2027           emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2028           EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2029           EMIT_ONE_BYTE (leading_codes[0]);
2030           if (leading_codes[1])
2031             EMIT_ONE_BYTE (leading_codes[1]);
2032           if (dimension == 1)
2033             EMIT_ONE_BYTE (code);
2034           else
2035             {
2036               EMIT_ONE_BYTE (code >> 8);
2037               EMIT_ONE_BYTE (code & 0xFF);
2038             }
2039         }
2040     }
2041   coding->result = CODING_RESULT_SUCCESS;
2042   coding->produced_char += produced_chars;
2043   coding->produced = dst - coding->destination;
2044   return 0;
2045 }
2046
2047 \f
2048 /*** 7. ISO2022 handlers ***/
2049
2050 /* The following note describes the coding system ISO2022 briefly.
2051    Since the intention of this note is to help understand the
2052    functions in this file, some parts are NOT ACCURATE or OVERLY
2053    SIMPLIFIED.  For thorough understanding, please refer to the
2054    original document of ISO2022.
2055
2056    ISO2022 provides many mechanisms to encode several character sets
2057    in 7-bit and 8-bit environments.  For 7-bite environments, all text
2058    is encoded using bytes less than 128.  This may make the encoded
2059    text a little bit longer, but the text passes more easily through
2060    several gateways, some of which strip off MSB (Most Signigant Bit).
2061
2062    There are two kinds of character sets: control character set and
2063    graphic character set.  The former contains control characters such
2064    as `newline' and `escape' to provide control functions (control
2065    functions are also provided by escape sequences).  The latter
2066    contains graphic characters such as 'A' and '-'.  Emacs recognizes
2067    two control character sets and many graphic character sets.
2068
2069    Graphic character sets are classified into one of the following
2070    four classes, according to the number of bytes (DIMENSION) and
2071    number of characters in one dimension (CHARS) of the set:
2072    - DIMENSION1_CHARS94
2073    - DIMENSION1_CHARS96
2074    - DIMENSION2_CHARS94
2075    - DIMENSION2_CHARS96
2076
2077    In addition, each character set is assigned an identification tag,
2078    unique for each set, called "final character" (denoted as <F>
2079    hereafter).  The <F> of each character set is decided by ECMA(*)
2080    when it is registered in ISO.  The code range of <F> is 0x30..0x7F
2081    (0x30..0x3F are for private use only).
2082
2083    Note (*): ECMA = European Computer Manufacturers Association
2084
2085    Here are examples of graphic character set [NAME(<F>)]:
2086         o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2087         o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2088         o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2089         o DIMENSION2_CHARS96 -- none for the moment
2090
2091    A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2092         C0 [0x00..0x1F] -- control character plane 0
2093         GL [0x20..0x7F] -- graphic character plane 0
2094         C1 [0x80..0x9F] -- control character plane 1
2095         GR [0xA0..0xFF] -- graphic character plane 1
2096
2097    A control character set is directly designated and invoked to C0 or
2098    C1 by an escape sequence.  The most common case is that:
2099    - ISO646's  control character set is designated/invoked to C0, and
2100    - ISO6429's control character set is designated/invoked to C1,
2101    and usually these designations/invocations are omitted in encoded
2102    text.  In a 7-bit environment, only C0 can be used, and a control
2103    character for C1 is encoded by an appropriate escape sequence to
2104    fit into the environment.  All control characters for C1 are
2105    defined to have corresponding escape sequences.
2106
2107    A graphic character set is at first designated to one of four
2108    graphic registers (G0 through G3), then these graphic registers are
2109    invoked to GL or GR.  These designations and invocations can be
2110    done independently.  The most common case is that G0 is invoked to
2111    GL, G1 is invoked to GR, and ASCII is designated to G0.  Usually
2112    these invocations and designations are omitted in encoded text.
2113    In a 7-bit environment, only GL can be used.
2114
2115    When a graphic character set of CHARS94 is invoked to GL, codes
2116    0x20 and 0x7F of the GL area work as control characters SPACE and
2117    DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2118    be used.
2119
2120    There are two ways of invocation: locking-shift and single-shift.
2121    With locking-shift, the invocation lasts until the next different
2122    invocation, whereas with single-shift, the invocation affects the
2123    following character only and doesn't affect the locking-shift
2124    state.  Invocations are done by the following control characters or
2125    escape sequences:
2126
2127    ----------------------------------------------------------------------
2128    abbrev  function                  cntrl escape seq   description
2129    ----------------------------------------------------------------------
2130    SI/LS0  (shift-in)                0x0F  none         invoke G0 into GL
2131    SO/LS1  (shift-out)               0x0E  none         invoke G1 into GL
2132    LS2     (locking-shift-2)         none  ESC 'n'      invoke G2 into GL
2133    LS3     (locking-shift-3)         none  ESC 'o'      invoke G3 into GL
2134    LS1R    (locking-shift-1 right)   none  ESC '~'      invoke G1 into GR (*)
2135    LS2R    (locking-shift-2 right)   none  ESC '}'      invoke G2 into GR (*)
2136    LS3R    (locking-shift 3 right)   none  ESC '|'      invoke G3 into GR (*)
2137    SS2     (single-shift-2)          0x8E  ESC 'N'      invoke G2 for one char
2138    SS3     (single-shift-3)          0x8F  ESC 'O'      invoke G3 for one char
2139    ----------------------------------------------------------------------
2140    (*) These are not used by any known coding system.
2141
2142    Control characters for these functions are defined by macros
2143    ISO_CODE_XXX in `coding.h'.
2144
2145    Designations are done by the following escape sequences:
2146    ----------------------------------------------------------------------
2147    escape sequence      description
2148    ----------------------------------------------------------------------
2149    ESC '(' <F>          designate DIMENSION1_CHARS94<F> to G0
2150    ESC ')' <F>          designate DIMENSION1_CHARS94<F> to G1
2151    ESC '*' <F>          designate DIMENSION1_CHARS94<F> to G2
2152    ESC '+' <F>          designate DIMENSION1_CHARS94<F> to G3
2153    ESC ',' <F>          designate DIMENSION1_CHARS96<F> to G0 (*)
2154    ESC '-' <F>          designate DIMENSION1_CHARS96<F> to G1
2155    ESC '.' <F>          designate DIMENSION1_CHARS96<F> to G2
2156    ESC '/' <F>          designate DIMENSION1_CHARS96<F> to G3
2157    ESC '$' '(' <F>      designate DIMENSION2_CHARS94<F> to G0 (**)
2158    ESC '$' ')' <F>      designate DIMENSION2_CHARS94<F> to G1
2159    ESC '$' '*' <F>      designate DIMENSION2_CHARS94<F> to G2
2160    ESC '$' '+' <F>      designate DIMENSION2_CHARS94<F> to G3
2161    ESC '$' ',' <F>      designate DIMENSION2_CHARS96<F> to G0 (*)
2162    ESC '$' '-' <F>      designate DIMENSION2_CHARS96<F> to G1
2163    ESC '$' '.' <F>      designate DIMENSION2_CHARS96<F> to G2
2164    ESC '$' '/' <F>      designate DIMENSION2_CHARS96<F> to G3
2165    ----------------------------------------------------------------------
2166
2167    In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2168    of dimension 1, chars 94, and final character <F>, etc...
2169
2170    Note (*): Although these designations are not allowed in ISO2022,
2171    Emacs accepts them on decoding, and produces them on encoding
2172    CHARS96 character sets in a coding system which is characterized as
2173    7-bit environment, non-locking-shift, and non-single-shift.
2174
2175    Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2176    '(' must be omitted.  We refer to this as "short-form" hereafter.
2177
2178    Now you may notice that there are a lot of ways for encoding the
2179    same multilingual text in ISO2022.  Actually, there exist many
2180    coding systems such as Compound Text (used in X11's inter client
2181    communication, ISO-2022-JP (used in Japanese internet), ISO-2022-KR
2182    (used in Korean internet), EUC (Extended UNIX Code, used in Asian
2183    localized platforms), and all of these are variants of ISO2022.
2184
2185    In addition to the above, Emacs handles two more kinds of escape
2186    sequences: ISO6429's direction specification and Emacs' private
2187    sequence for specifying character composition.
2188
2189    ISO6429's direction specification takes the following form:
2190         o CSI ']'      -- end of the current direction
2191         o CSI '0' ']'  -- end of the current direction
2192         o CSI '1' ']'  -- start of left-to-right text
2193         o CSI '2' ']'  -- start of right-to-left text
2194    The control character CSI (0x9B: control sequence introducer) is
2195    abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2196
2197    Character composition specification takes the following form:
2198         o ESC '0' -- start relative composition
2199         o ESC '1' -- end composition
2200         o ESC '2' -- start rule-base composition (*)
2201         o ESC '3' -- start relative composition with alternate chars  (**)
2202         o ESC '4' -- start rule-base composition with alternate chars  (**)
2203   Since these are not standard escape sequences of any ISO standard,
2204   the use of them for these meaning is restricted to Emacs only.
2205
2206   (*) This form is used only in Emacs 20.5 and the older versions,
2207   but the newer versions can safely decode it.
2208   (**) This form is used only in Emacs 21.1 and the newer versions,
2209   and the older versions can't decode it.
2210
2211   Here's a list of examples usages of these composition escape
2212   sequences (categorized by `enum composition_method').
2213
2214   COMPOSITION_RELATIVE:
2215         ESC 0 CHAR [ CHAR ] ESC 1
2216   COMPOSITOIN_WITH_RULE:
2217         ESC 2 CHAR [ RULE CHAR ] ESC 1
2218   COMPOSITION_WITH_ALTCHARS:
2219         ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2220   COMPOSITION_WITH_RULE_ALTCHARS:
2221         ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2222
2223 enum iso_code_class_type iso_code_class[256];
2224
2225 #define SAFE_CHARSET_P(coding, id)      \
2226   ((id) <= (coding)->max_charset_id     \
2227    && (coding)->safe_charsets[id] >= 0)
2228
2229
2230 #define SHIFT_OUT_OK(category)  \
2231   (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2232
2233 static void
2234 setup_iso_safe_charsets (attrs)
2235      Lisp_Object attrs;
2236 {
2237   Lisp_Object charset_list, safe_charsets;
2238   Lisp_Object request;
2239   Lisp_Object reg_usage;
2240   Lisp_Object tail;
2241   int reg94, reg96;
2242   int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2243   int max_charset_id;
2244
2245   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2246   if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2247       && ! EQ (charset_list, Viso_2022_charset_list))
2248     {
2249       CODING_ATTR_CHARSET_LIST (attrs)
2250         = charset_list = Viso_2022_charset_list;
2251       ASET (attrs, coding_attr_safe_charsets, Qnil);
2252     }
2253
2254   if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2255     return;
2256
2257   max_charset_id = 0;
2258   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2259     {
2260       int id = XINT (XCAR (tail));
2261       if (max_charset_id < id)
2262         max_charset_id = id;
2263     }
2264
2265   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2266                                 make_number (255));
2267   request = AREF (attrs, coding_attr_iso_request);
2268   reg_usage = AREF (attrs, coding_attr_iso_usage);
2269   reg94 = XINT (XCAR (reg_usage));
2270   reg96 = XINT (XCDR (reg_usage));
2271
2272   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2273     {
2274       Lisp_Object id;
2275       Lisp_Object reg;
2276       struct charset *charset;
2277
2278       id = XCAR (tail);
2279       charset = CHARSET_FROM_ID (XINT (id));
2280       reg = Fcdr (Fassq (id, request));
2281       if (! NILP (reg))
2282         XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2283       else if (charset->iso_chars_96)
2284         {
2285           if (reg96 < 4)
2286             XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2287         }
2288       else
2289         {
2290           if (reg94 < 4)
2291             XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2292         }
2293     }
2294   ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2295 }
2296
2297
2298 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2299    Check if a text is encoded in ISO2022.  If it is, returns an
2300    integer in which appropriate flag bits any of:
2301         CATEGORY_MASK_ISO_7
2302         CATEGORY_MASK_ISO_7_TIGHT
2303         CATEGORY_MASK_ISO_8_1
2304         CATEGORY_MASK_ISO_8_2
2305         CATEGORY_MASK_ISO_7_ELSE
2306         CATEGORY_MASK_ISO_8_ELSE
2307    are set.  If a code which should never appear in ISO2022 is found,
2308    returns 0.  */
2309
2310 static int
2311 detect_coding_iso_2022 (coding, mask)
2312      struct coding_system *coding;
2313      int *mask;
2314 {
2315   unsigned char *src = coding->source, *src_base = src;
2316   unsigned char *src_end = coding->source + coding->src_bytes;
2317   int multibytep = coding->src_multibyte;
2318   int mask_iso = CATEGORY_MASK_ISO;
2319   int mask_found = 0, mask_8bit_found = 0;
2320   int reg[4], shift_out = 0, single_shifting = 0;
2321   int id;
2322   int c, c1;
2323   int consumed_chars = 0;
2324   int i;
2325
2326   for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2327     {
2328       struct coding_system *this = &(coding_categories[i]);
2329       Lisp_Object attrs, val;
2330
2331       attrs = CODING_ID_ATTRS (this->id);
2332       if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2333           && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2334         setup_iso_safe_charsets (attrs);
2335       val = CODING_ATTR_SAFE_CHARSETS (attrs);
2336       this->max_charset_id = XSTRING (val)->size - 1;
2337       this->safe_charsets = (char *) XSTRING (val)->data;
2338     }
2339
2340   /* A coding system of this category is always ASCII compatible.  */
2341   src += coding->head_ascii;
2342
2343   reg[0] = charset_ascii, reg[1] = reg[2] = reg[3] = -1;
2344   while (mask_iso && src < src_end)
2345     {
2346       ONE_MORE_BYTE (c);
2347       switch (c)
2348         {
2349         case ISO_CODE_ESC:
2350           if (inhibit_iso_escape_detection)
2351             break;
2352           single_shifting = 0;
2353           ONE_MORE_BYTE (c);
2354           if (c >= '(' && c <= '/')
2355             {
2356               /* Designation sequence for a charset of dimension 1.  */
2357               ONE_MORE_BYTE (c1);
2358               if (c1 < ' ' || c1 >= 0x80
2359                   || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2360                 /* Invalid designation sequence.  Just ignore.  */
2361                 break;
2362               reg[(c - '(') % 4] = id;
2363             }
2364           else if (c == '$')
2365             {
2366               /* Designation sequence for a charset of dimension 2.  */
2367               ONE_MORE_BYTE (c);
2368               if (c >= '@' && c <= 'B')
2369                 /* Designation for JISX0208.1978, GB2312, or JISX0208.  */
2370                 reg[0] = id = iso_charset_table[1][0][c];
2371               else if (c >= '(' && c <= '/')
2372                 {
2373                   ONE_MORE_BYTE (c1);
2374                   if (c1 < ' ' || c1 >= 0x80
2375                       || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2376                     /* Invalid designation sequence.  Just ignore.  */
2377                     break;
2378                   reg[(c - '(') % 4] = id;
2379                 }
2380               else
2381                 /* Invalid designation sequence.  Just ignore.  */
2382                 break;
2383             }
2384           else if (c == 'N' || c == 'O')
2385             {
2386               /* ESC <Fe> for SS2 or SS3.  */
2387               mask_iso &= CATEGORY_MASK_ISO_7_ELSE;
2388               break;
2389             }
2390           else if (c >= '0' && c <= '4')
2391             {
2392               /* ESC <Fp> for start/end composition.  */
2393               mask_found |= CATEGORY_MASK_ISO;
2394               break;
2395             }
2396           else
2397             {
2398               /* Invalid escape sequence.  */
2399               mask_iso &= ~CATEGORY_MASK_ISO_ESCAPE;
2400               break;
2401             }
2402
2403           /* We found a valid designation sequence for CHARSET.  */
2404           mask_iso &= ~CATEGORY_MASK_ISO_8BIT;
2405           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2406                               id))
2407             mask_found |= CATEGORY_MASK_ISO_7;
2408           else
2409             mask_iso &= ~CATEGORY_MASK_ISO_7;
2410           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2411                               id))
2412             mask_found |= CATEGORY_MASK_ISO_7_TIGHT;
2413           else
2414             mask_iso &= ~CATEGORY_MASK_ISO_7_TIGHT;
2415           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2416                               id))
2417             mask_found |= CATEGORY_MASK_ISO_7_ELSE;
2418           else
2419             mask_iso &= ~CATEGORY_MASK_ISO_7_ELSE;
2420           if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2421                               id))
2422             mask_found |= CATEGORY_MASK_ISO_8_ELSE;
2423           else
2424             mask_iso &= ~CATEGORY_MASK_ISO_8_ELSE;
2425           break;
2426
2427         case ISO_CODE_SO:
2428           if (inhibit_iso_escape_detection)
2429             break;
2430           single_shifting = 0;
2431           if (shift_out == 0
2432               && (reg[1] >= 0
2433                   || SHIFT_OUT_OK (coding_category_iso_7_else)
2434                   || SHIFT_OUT_OK (coding_category_iso_8_else)))
2435             {
2436               /* Locking shift out.  */
2437               mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2438               mask_found |= CATEGORY_MASK_ISO_ELSE;
2439             }
2440           break;
2441
2442         case ISO_CODE_SI:
2443           if (inhibit_iso_escape_detection)
2444             break;
2445           single_shifting = 0;
2446           if (shift_out == 1)
2447             {
2448               /* Locking shift in.  */
2449               mask_iso &= ~CATEGORY_MASK_ISO_7BIT;
2450               mask_found |= CATEGORY_MASK_ISO_ELSE;
2451             }
2452           break;
2453
2454         case ISO_CODE_CSI:
2455           single_shifting = 0;
2456         case ISO_CODE_SS2:
2457         case ISO_CODE_SS3:
2458           {
2459             int newmask = CATEGORY_MASK_ISO_8_ELSE;
2460
2461             if (inhibit_iso_escape_detection)
2462               break;
2463             if (c != ISO_CODE_CSI)
2464               {
2465                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2466                     & CODING_ISO_FLAG_SINGLE_SHIFT)
2467                   newmask |= CATEGORY_MASK_ISO_8_1;
2468                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2469                     & CODING_ISO_FLAG_SINGLE_SHIFT)
2470                   newmask |= CATEGORY_MASK_ISO_8_2;
2471                 single_shifting = 1;
2472               }
2473             if (VECTORP (Vlatin_extra_code_table)
2474                 && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2475               {
2476                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2477                     & CODING_ISO_FLAG_LATIN_EXTRA)
2478                   newmask |= CATEGORY_MASK_ISO_8_1;
2479                 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2480                     & CODING_ISO_FLAG_LATIN_EXTRA)
2481                   newmask |= CATEGORY_MASK_ISO_8_2;
2482               }
2483             mask_iso &= newmask;
2484             mask_found |= newmask;
2485           }
2486           break;
2487
2488         default:
2489           if (c < 0x80)
2490             {
2491               single_shifting = 0;
2492               break;
2493             }
2494           else if (c < 0xA0)
2495             {
2496               single_shifting = 0;
2497               mask_8bit_found = 1;
2498               if (VECTORP (Vlatin_extra_code_table)
2499                   && !NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2500                 {
2501                   int newmask = 0;
2502
2503                   if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2504                       & CODING_ISO_FLAG_LATIN_EXTRA)
2505                     newmask |= CATEGORY_MASK_ISO_8_1;
2506                   if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2507                       & CODING_ISO_FLAG_LATIN_EXTRA)
2508                     newmask |= CATEGORY_MASK_ISO_8_2;
2509                   mask_iso &= newmask;
2510                   mask_found |= newmask;
2511                 }
2512               else
2513                 return 0;
2514             }
2515           else
2516             {
2517               mask_iso &= ~(CATEGORY_MASK_ISO_7BIT
2518                             | CATEGORY_MASK_ISO_7_ELSE);
2519               mask_found |= CATEGORY_MASK_ISO_8_1;
2520               mask_8bit_found = 1;
2521               /* Check the length of succeeding codes of the range
2522                  0xA0..0FF.  If the byte length is odd, we exclude
2523                  CATEGORY_MASK_ISO_8_2.  We can check this only
2524                  when we are not single shifting.  */
2525               if (!single_shifting
2526                   && mask_iso & CATEGORY_MASK_ISO_8_2)
2527                 {
2528                   int i = 1;
2529                   while (src < src_end)
2530                     {
2531                       ONE_MORE_BYTE (c);
2532                       if (c < 0xA0)
2533                         break;
2534                       i++;
2535                     }
2536
2537                   if (i & 1 && src < src_end)
2538                     mask_iso &= ~CATEGORY_MASK_ISO_8_2;
2539                   else
2540                     mask_found |= CATEGORY_MASK_ISO_8_2;
2541                 }
2542             }
2543           break;
2544         }
2545     }
2546  no_more_source:
2547   if (!mask_iso)
2548     {
2549       *mask &= ~CATEGORY_MASK_ISO;
2550       return 0;
2551     }
2552   if (!mask_found)
2553     return 0;
2554   *mask &= mask_iso & mask_found;
2555   if (! mask_8bit_found)
2556     *mask &= ~(CATEGORY_MASK_ISO_8BIT | CATEGORY_MASK_ISO_8_ELSE);
2557   return 1;
2558 }
2559
2560
2561 /* Set designation state into CODING.  */
2562 #define DECODE_DESIGNATION(reg, dim, chars_96, final)                   \
2563   do {                                                                  \
2564     int id, prev;                                                       \
2565                                                                         \
2566     if (final < '0' || final >= 128                                     \
2567         || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0)        \
2568         || !SAFE_CHARSET_P (coding, id))                                \
2569       {                                                                 \
2570         CODING_ISO_DESIGNATION (coding, reg) = -2;                      \
2571         goto invalid_code;                                              \
2572       }                                                                 \
2573     prev = CODING_ISO_DESIGNATION (coding, reg);                        \
2574     if (id == charset_jisx0201_roman)                                   \
2575       {                                                                 \
2576         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)      \
2577           id = charset_ascii;                                           \
2578       }                                                                 \
2579     else if (id == charset_jisx0208_1978)                               \
2580       {                                                                 \
2581         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)     \
2582           id = charset_jisx0208;                                        \
2583       }                                                                 \
2584     CODING_ISO_DESIGNATION (coding, reg) = id;                          \
2585     /* If there was an invalid designation to REG previously, and this  \
2586        designation is ASCII to REG, we should keep this designation     \
2587        sequence.  */                                                    \
2588     if (prev == -2 && id == charset_ascii)                              \
2589       goto invalid_code;                                                \
2590   } while (0)
2591
2592
2593 #define MAYBE_FINISH_COMPOSITION()                              \
2594   do {                                                          \
2595     int i;                                                      \
2596     if (composition_state == COMPOSING_NO)                      \
2597       break;                                                    \
2598     /* It is assured that we have enough room for producing     \
2599        characters stored in the table `components'.  */         \
2600     if (charbuf + component_idx > charbuf_end)                  \
2601       goto no_more_source;                                      \
2602     composition_state = COMPOSING_NO;                           \
2603     if (method == COMPOSITION_RELATIVE                          \
2604         || method == COMPOSITION_WITH_ALTCHARS)                 \
2605       {                                                         \
2606         for (i = 0; i < component_idx; i++)                     \
2607           *charbuf++ = components[i];                           \
2608         char_offset += component_idx;                           \
2609       }                                                         \
2610     else                                                        \
2611       {                                                         \
2612         for (i = 0; i < component_idx; i += 2)                  \
2613           *charbuf++ = components[i];                           \
2614         char_offset += (component_idx / 2) + 1;                 \
2615       }                                                         \
2616   } while (0)
2617
2618
2619 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2620    ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2621    ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2622    ESC 3 : altchar composition :  ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2623    ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2624   */
2625
2626 #define DECODE_COMPOSITION_START(c1)                                    \
2627   do {                                                                  \
2628     if (c1 == '0'                                                       \
2629         && composition_state == COMPOSING_COMPONENT_RULE)               \
2630       {                                                                 \
2631         component_len = component_idx;                                  \
2632         composition_state = COMPOSING_CHAR;                             \
2633       }                                                                 \
2634     else                                                                \
2635       {                                                                 \
2636         unsigned char *p;                                               \
2637                                                                         \
2638         MAYBE_FINISH_COMPOSITION ();                                    \
2639         if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end)         \
2640           goto no_more_source;                                          \
2641         for (p = src; p < src_end - 1; p++)                             \
2642           if (*p == ISO_CODE_ESC && p[1] == '1')                        \
2643             break;                                                      \
2644         if (p == src_end - 1)                                           \
2645           {                                                             \
2646             if (coding->mode & CODING_MODE_LAST_BLOCK)                  \
2647               goto invalid_code;                                        \
2648             goto no_more_source;                                        \
2649           }                                                             \
2650                                                                         \
2651         /* This is surely the start of a composition.  */               \
2652         method = (c1 == '0' ? COMPOSITION_RELATIVE                      \
2653                   : c1 == '2' ? COMPOSITION_WITH_RULE                   \
2654                   : c1 == '3' ? COMPOSITION_WITH_ALTCHARS               \
2655                   : COMPOSITION_WITH_RULE_ALTCHARS);                    \
2656         composition_state = (c1 <= '2' ? COMPOSING_CHAR                 \
2657                              : COMPOSING_COMPONENT_CHAR);               \
2658         component_idx = component_len = 0;                              \
2659       }                                                                 \
2660   } while (0)
2661
2662
2663 /* Handle compositoin end sequence ESC 1.  */
2664
2665 #define DECODE_COMPOSITION_END()                                        \
2666   do {                                                                  \
2667     int nchars = (component_len > 0 ? component_idx - component_len     \
2668                   : method == COMPOSITION_RELATIVE ? component_idx      \
2669                   : (component_idx + 1) / 2);                           \
2670     int i;                                                              \
2671     int *saved_charbuf = charbuf;                                       \
2672                                                                         \
2673     ADD_COMPOSITION_DATA (charbuf, method, nchars);                     \
2674     if (method != COMPOSITION_RELATIVE)                                 \
2675       {                                                                 \
2676         if (component_len == 0)                                         \
2677           for (i = 0; i < component_idx; i++)                           \
2678             *charbuf++ = components[i];                                 \
2679         else                                                            \
2680           for (i = 0; i < component_len; i++)                           \
2681             *charbuf++ = components[i];                                 \
2682         *saved_charbuf = saved_charbuf - charbuf;                       \
2683       }                                                                 \
2684     if (method == COMPOSITION_WITH_RULE)                                \
2685       for (i = 0; i < component_idx; i += 2, char_offset++)             \
2686         *charbuf++ = components[i];                                     \
2687     else                                                                \
2688       for (i = component_len; i < component_idx; i++, char_offset++)    \
2689         *charbuf++ = components[i];                                     \
2690     coding->annotated = 1;                                              \
2691     composition_state = COMPOSING_NO;                                   \
2692   } while (0)
2693
2694
2695 /* Decode a composition rule from the byte C1 (and maybe one more byte
2696    from SRC) and store one encoded composition rule in
2697    coding->cmp_data.  */
2698
2699 #define DECODE_COMPOSITION_RULE(c1)                                     \
2700   do {                                                                  \
2701     (c1) -= 32;                                                         \
2702     if (c1 < 81)                /* old format (before ver.21) */        \
2703       {                                                                 \
2704         int gref = (c1) / 9;                                            \
2705         int nref = (c1) % 9;                                            \
2706         if (gref == 4) gref = 10;                                       \
2707         if (nref == 4) nref = 10;                                       \
2708         c1 = COMPOSITION_ENCODE_RULE (gref, nref);                      \
2709       }                                                                 \
2710     else if (c1 < 93)           /* new format (after ver.21) */         \
2711       {                                                                 \
2712         ONE_MORE_BYTE (c2);                                             \
2713         c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32);                \
2714       }                                                                 \
2715     else                                                                \
2716       c1 = 0;                                                           \
2717   } while (0)
2718
2719
2720 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
2721
2722 static void
2723 decode_coding_iso_2022 (coding)
2724      struct coding_system *coding;
2725 {
2726   unsigned char *src = coding->source + coding->consumed;
2727   unsigned char *src_end = coding->source + coding->src_bytes;
2728   unsigned char *src_base;
2729   int *charbuf = coding->charbuf;
2730   int *charbuf_end = charbuf + coding->charbuf_size - 4;
2731   int consumed_chars = 0, consumed_chars_base;
2732   int char_offset = 0;
2733   int multibytep = coding->src_multibyte;
2734   /* Charsets invoked to graphic plane 0 and 1 respectively.  */
2735   int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2736   int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2737   struct charset *charset;
2738   int c;
2739   /* For handling composition sequence.  */
2740 #define COMPOSING_NO                    0
2741 #define COMPOSING_CHAR                  1
2742 #define COMPOSING_RULE                  2
2743 #define COMPOSING_COMPONENT_CHAR        3
2744 #define COMPOSING_COMPONENT_RULE        4
2745
2746   int composition_state = COMPOSING_NO;
2747   enum composition_method method;
2748   int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2749   int component_idx;
2750   int component_len;
2751   Lisp_Object attrs, eol_type, charset_list;
2752
2753   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2754   setup_iso_safe_charsets (attrs);
2755
2756   while (1)
2757     {
2758       int c1, c2;
2759
2760       src_base = src;
2761       consumed_chars_base = consumed_chars;
2762
2763       if (charbuf >= charbuf_end)
2764         break;
2765
2766       ONE_MORE_BYTE (c1);
2767
2768       /* We produce no character or one character.  */
2769       switch (iso_code_class [c1])
2770         {
2771         case ISO_0x20_or_0x7F:
2772           if (composition_state != COMPOSING_NO)
2773             {
2774               if (composition_state == COMPOSING_RULE
2775                   || composition_state == COMPOSING_COMPONENT_RULE)
2776                 {
2777                   DECODE_COMPOSITION_RULE (c1);
2778                   components[component_idx++] = c1;
2779                   composition_state--;
2780                   continue;
2781                 }
2782             }
2783           if (charset_id_0 < 0
2784               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2785             /* This is SPACE or DEL.  */
2786             charset = CHARSET_FROM_ID (charset_ascii);
2787           else
2788             charset = CHARSET_FROM_ID (charset_id_0);
2789           break;
2790
2791         case ISO_graphic_plane_0:
2792           if (composition_state != COMPOSING_NO)
2793             {
2794               if (composition_state == COMPOSING_RULE
2795                   || composition_state == COMPOSING_COMPONENT_RULE)
2796                 {
2797                   DECODE_COMPOSITION_RULE (c1);
2798                   components[component_idx++] = c1;
2799                   composition_state--;
2800                   continue;
2801                 }
2802             }
2803           charset = CHARSET_FROM_ID (charset_id_0);
2804           break;
2805
2806         case ISO_0xA0_or_0xFF:
2807           if (charset_id_1 < 0
2808               || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2809               || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2810             goto invalid_code;
2811           /* This is a graphic character, we fall down ... */
2812
2813         case ISO_graphic_plane_1:
2814           if (charset_id_1 < 0)
2815             goto invalid_code;
2816           charset = CHARSET_FROM_ID (charset_id_1);
2817           break;
2818
2819         case ISO_carriage_return:
2820           if (c1 == '\r')
2821             {
2822               if (EQ (eol_type, Qdos))
2823                 {
2824                   if (src == src_end)
2825                     goto no_more_source;
2826                   if (*src == '\n')
2827                     ONE_MORE_BYTE (c1);
2828                 }
2829               else if (EQ (eol_type, Qmac))
2830                 c1 = '\n';
2831             }
2832           /* fall through */
2833
2834         case ISO_control_0:
2835           MAYBE_FINISH_COMPOSITION ();
2836           charset = CHARSET_FROM_ID (charset_ascii);
2837           break;
2838
2839         case ISO_control_1:
2840           MAYBE_FINISH_COMPOSITION ();
2841           goto invalid_code;
2842
2843         case ISO_shift_out:
2844           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2845               || CODING_ISO_DESIGNATION (coding, 1) < 0)
2846             goto invalid_code;
2847           CODING_ISO_INVOCATION (coding, 0) = 1;
2848           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2849           continue;
2850
2851         case ISO_shift_in:
2852           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2853             goto invalid_code;
2854           CODING_ISO_INVOCATION (coding, 0) = 0;
2855           charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2856           continue;
2857
2858         case ISO_single_shift_2_7:
2859         case ISO_single_shift_2:
2860           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2861             goto invalid_code;
2862           /* SS2 is handled as an escape sequence of ESC 'N' */
2863           c1 = 'N';
2864           goto label_escape_sequence;
2865
2866         case ISO_single_shift_3:
2867           if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2868             goto invalid_code;
2869           /* SS2 is handled as an escape sequence of ESC 'O' */
2870           c1 = 'O';
2871           goto label_escape_sequence;
2872
2873         case ISO_control_sequence_introducer:
2874           /* CSI is handled as an escape sequence of ESC '[' ...  */
2875           c1 = '[';
2876           goto label_escape_sequence;
2877
2878         case ISO_escape:
2879           ONE_MORE_BYTE (c1);
2880         label_escape_sequence:
2881           /* Escape sequences handled here are invocation,
2882              designation, direction specification, and character
2883              composition specification.  */
2884           switch (c1)
2885             {
2886             case '&':           /* revision of following character set */
2887               ONE_MORE_BYTE (c1);
2888               if (!(c1 >= '@' && c1 <= '~'))
2889                 goto invalid_code;
2890               ONE_MORE_BYTE (c1);
2891               if (c1 != ISO_CODE_ESC)
2892                 goto invalid_code;
2893               ONE_MORE_BYTE (c1);
2894               goto label_escape_sequence;
2895
2896             case '$':           /* designation of 2-byte character set */
2897               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2898                 goto invalid_code;
2899               ONE_MORE_BYTE (c1);
2900               if (c1 >= '@' && c1 <= 'B')
2901                 {       /* designation of JISX0208.1978, GB2312.1980,
2902                            or JISX0208.1980 */
2903                   DECODE_DESIGNATION (0, 2, 0, c1);
2904                 }
2905               else if (c1 >= 0x28 && c1 <= 0x2B)
2906                 {       /* designation of DIMENSION2_CHARS94 character set */
2907                   ONE_MORE_BYTE (c2);
2908                   DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2909                 }
2910               else if (c1 >= 0x2C && c1 <= 0x2F)
2911                 {       /* designation of DIMENSION2_CHARS96 character set */
2912                   ONE_MORE_BYTE (c2);
2913                   DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2914                 }
2915               else
2916                 goto invalid_code;
2917               /* We must update these variables now.  */
2918               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2919               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2920               continue;
2921
2922             case 'n':           /* invocation of locking-shift-2 */
2923               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2924                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
2925                 goto invalid_code;
2926               CODING_ISO_INVOCATION (coding, 0) = 2;
2927               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2928               continue;
2929
2930             case 'o':           /* invocation of locking-shift-3 */
2931               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2932                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
2933                 goto invalid_code;
2934               CODING_ISO_INVOCATION (coding, 0) = 3;
2935               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2936               continue;
2937
2938             case 'N':           /* invocation of single-shift-2 */
2939               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2940                   || CODING_ISO_DESIGNATION (coding, 2) < 0)
2941                 goto invalid_code;
2942               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
2943               ONE_MORE_BYTE (c1);
2944               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2945                 goto invalid_code;
2946               break;
2947
2948             case 'O':           /* invocation of single-shift-3 */
2949               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
2950                   || CODING_ISO_DESIGNATION (coding, 3) < 0)
2951                 goto invalid_code;
2952               charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
2953               ONE_MORE_BYTE (c1);
2954               if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
2955                 goto invalid_code;
2956               break;
2957
2958             case '0': case '2': case '3': case '4': /* start composition */
2959               if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
2960                 goto invalid_code;
2961               DECODE_COMPOSITION_START (c1);
2962               continue;
2963
2964             case '1':           /* end composition */
2965               if (composition_state == COMPOSING_NO)
2966                 goto invalid_code;
2967               DECODE_COMPOSITION_END ();
2968               continue;
2969
2970             case '[':           /* specification of direction */
2971               if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
2972                 goto invalid_code;
2973               /* For the moment, nested direction is not supported.
2974                  So, `coding->mode & CODING_MODE_DIRECTION' zero means
2975                  left-to-right, and nozero means right-to-left.  */
2976               ONE_MORE_BYTE (c1);
2977               switch (c1)
2978                 {
2979                 case ']':       /* end of the current direction */
2980                   coding->mode &= ~CODING_MODE_DIRECTION;
2981
2982                 case '0':       /* end of the current direction */
2983                 case '1':       /* start of left-to-right direction */
2984                   ONE_MORE_BYTE (c1);
2985                   if (c1 == ']')
2986                     coding->mode &= ~CODING_MODE_DIRECTION;
2987                   else
2988                     goto invalid_code;
2989                   break;
2990
2991                 case '2':       /* start of right-to-left direction */
2992                   ONE_MORE_BYTE (c1);
2993                   if (c1 == ']')
2994                     coding->mode |= CODING_MODE_DIRECTION;
2995                   else
2996                     goto invalid_code;
2997                   break;
2998
2999                 default:
3000                   goto invalid_code;
3001                 }
3002               continue;
3003
3004             default:
3005               if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3006                 goto invalid_code;
3007               if (c1 >= 0x28 && c1 <= 0x2B)
3008                 {       /* designation of DIMENSION1_CHARS94 character set */
3009                   ONE_MORE_BYTE (c2);
3010                   DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3011                 }
3012               else if (c1 >= 0x2C && c1 <= 0x2F)
3013                 {       /* designation of DIMENSION1_CHARS96 character set */
3014                   ONE_MORE_BYTE (c2);
3015                   DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3016                 }
3017               else
3018                 goto invalid_code;
3019               /* We must update these variables now.  */
3020               charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3021               charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3022               continue;
3023             }
3024         }
3025
3026       /* Now we know CHARSET and 1st position code C1 of a character.
3027          Produce a decoded character while getting 2nd position code
3028          C2 if necessary.  */
3029       c1 &= 0x7F;
3030       if (CHARSET_DIMENSION (charset) > 1)
3031         {
3032           ONE_MORE_BYTE (c2);
3033           if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3034             /* C2 is not in a valid range.  */
3035             goto invalid_code;
3036           c1 = (c1 << 8) | (c2 & 0x7F);
3037           if (CHARSET_DIMENSION (charset) > 2)
3038             {
3039               ONE_MORE_BYTE (c2);
3040               if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3041                 /* C2 is not in a valid range.  */
3042                 goto invalid_code;
3043               c1 = (c1 << 8) | (c2 & 0x7F);
3044             }
3045         }
3046
3047       CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3048       if (c < 0)
3049         {
3050           MAYBE_FINISH_COMPOSITION ();
3051           for (; src_base < src; src_base++, char_offset++)
3052             {
3053               if (ASCII_BYTE_P (*src_base))
3054                 *charbuf++ = *src_base;
3055               else
3056                 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3057             }
3058         }
3059       else if (composition_state == COMPOSING_NO)
3060         {
3061           *charbuf++ = c;
3062           char_offset++;
3063         }
3064       else
3065         {
3066           components[component_idx++] = c;
3067           if (method == COMPOSITION_WITH_RULE
3068               || (method == COMPOSITION_WITH_RULE_ALTCHARS
3069                   && composition_state == COMPOSING_COMPONENT_CHAR))
3070             composition_state++;
3071         }
3072       continue;
3073
3074     invalid_code:
3075       MAYBE_FINISH_COMPOSITION ();
3076       src = src_base;
3077       consumed_chars = consumed_chars_base;
3078       ONE_MORE_BYTE (c);
3079       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3080       coding->errors++;
3081     }
3082
3083  no_more_source:
3084   coding->consumed_char += consumed_chars_base;
3085   coding->consumed = src_base - coding->source;
3086   coding->charbuf_used = charbuf - coding->charbuf;
3087 }
3088
3089
3090 /* ISO2022 encoding stuff.  */
3091
3092 /*
3093    It is not enough to say just "ISO2022" on encoding, we have to
3094    specify more details.  In Emacs, each coding system of ISO2022
3095    variant has the following specifications:
3096         1. Initial designation to G0 thru G3.
3097         2. Allows short-form designation?
3098         3. ASCII should be designated to G0 before control characters?
3099         4. ASCII should be designated to G0 at end of line?
3100         5. 7-bit environment or 8-bit environment?
3101         6. Use locking-shift?
3102         7. Use Single-shift?
3103    And the following two are only for Japanese:
3104         8. Use ASCII in place of JIS0201-1976-Roman?
3105         9. Use JISX0208-1983 in place of JISX0208-1978?
3106    These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3107    defined by macros CODING_ISO_FLAG_XXX.  See `coding.h' for more
3108    details.
3109 */
3110
3111 /* Produce codes (escape sequence) for designating CHARSET to graphic
3112    register REG at DST, and increment DST.  If <final-char> of CHARSET is
3113    '@', 'A', or 'B' and the coding system CODING allows, produce
3114    designation sequence of short-form.  */
3115
3116 #define ENCODE_DESIGNATION(charset, reg, coding)                        \
3117   do {                                                                  \
3118     unsigned char final_char = CHARSET_ISO_FINAL (charset);             \
3119     char *intermediate_char_94 = "()*+";                                \
3120     char *intermediate_char_96 = ",-./";                                \
3121     int revision = -1;                                                  \
3122     int c;                                                              \
3123                                                                         \
3124     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION)           \
3125       revision = XINT (CHARSET_ISO_REVISION (charset));                 \
3126                                                                         \
3127     if (revision >= 0)                                                  \
3128       {                                                                 \
3129         EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&');                       \
3130         EMIT_ONE_BYTE ('@' + revision);                                 \
3131       }                                                                 \
3132     EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC);                                 \
3133     if (CHARSET_DIMENSION (charset) == 1)                               \
3134       {                                                                 \
3135         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3136           c = intermediate_char_94[reg];                                \
3137         else                                                            \
3138           c = intermediate_char_96[reg];                                \
3139         EMIT_ONE_ASCII_BYTE (c);                                        \
3140       }                                                                 \
3141     else                                                                \
3142       {                                                                 \
3143         EMIT_ONE_ASCII_BYTE ('$');                                      \
3144         if (! CHARSET_ISO_CHARS_96 (charset))                           \
3145           {                                                             \
3146             if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM   \
3147                 || reg != 0                                             \
3148                 || final_char < '@' || final_char > 'B')                \
3149               EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]);          \
3150           }                                                             \
3151         else                                                            \
3152           EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]);              \
3153       }                                                                 \
3154     EMIT_ONE_ASCII_BYTE (final_char);                                   \
3155                                                                         \
3156     CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset);        \
3157   } while (0)
3158
3159
3160 /* The following two macros produce codes (control character or escape
3161    sequence) for ISO2022 single-shift functions (single-shift-2 and
3162    single-shift-3).  */
3163
3164 #define ENCODE_SINGLE_SHIFT_2                                           \
3165   do {                                                                  \
3166     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3167       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N');                         \
3168     else                                                                \
3169       EMIT_ONE_BYTE (ISO_CODE_SS2);                                     \
3170     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3171   } while (0)
3172
3173
3174 #define ENCODE_SINGLE_SHIFT_3                                           \
3175   do {                                                                  \
3176     if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)         \
3177       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O');                         \
3178     else                                                                \
3179       EMIT_ONE_BYTE (ISO_CODE_SS3);                                     \
3180     CODING_ISO_SINGLE_SHIFTING (coding) = 1;                            \
3181   } while (0)
3182
3183
3184 /* The following four macros produce codes (control character or
3185    escape sequence) for ISO2022 locking-shift functions (shift-in,
3186    shift-out, locking-shift-2, and locking-shift-3).  */
3187
3188 #define ENCODE_SHIFT_IN                                 \
3189   do {                                                  \
3190     EMIT_ONE_ASCII_BYTE (ISO_CODE_SI);                  \
3191     CODING_ISO_INVOCATION (coding, 0) = 0;              \
3192   } while (0)
3193
3194
3195 #define ENCODE_SHIFT_OUT                                \
3196   do {                                                  \
3197     EMIT_ONE_ASCII_BYTE (ISO_CODE_SO);                  \
3198     CODING_ISO_INVOCATION (coding, 0) = 1;              \
3199   } while (0)
3200
3201
3202 #define ENCODE_LOCKING_SHIFT_2                          \
3203   do {                                                  \
3204     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3205     CODING_ISO_INVOCATION (coding, 0) = 2;              \
3206   } while (0)
3207
3208
3209 #define ENCODE_LOCKING_SHIFT_3                          \
3210   do {                                                  \
3211     EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n');           \
3212     CODING_ISO_INVOCATION (coding, 0) = 3;              \
3213   } while (0)
3214
3215
3216 /* Produce codes for a DIMENSION1 character whose character set is
3217    CHARSET and whose position-code is C1.  Designation and invocation
3218    sequences are also produced in advance if necessary.  */
3219
3220 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1)                    \
3221   do {                                                                  \
3222     int id = CHARSET_ID (charset);                                      \
3223                                                                         \
3224     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN)         \
3225         && id == charset_ascii)                                         \
3226       {                                                                 \
3227         id = charset_jisx0201_roman;                                    \
3228         charset = CHARSET_FROM_ID (id);                                 \
3229       }                                                                 \
3230                                                                         \
3231     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3232       {                                                                 \
3233         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3234           EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                              \
3235         else                                                            \
3236           EMIT_ONE_BYTE (c1 | 0x80);                                    \
3237         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3238         break;                                                          \
3239       }                                                                 \
3240     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3241       {                                                                 \
3242         EMIT_ONE_ASCII_BYTE (c1 & 0x7F);                                \
3243         break;                                                          \
3244       }                                                                 \
3245     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3246       {                                                                 \
3247         EMIT_ONE_BYTE (c1 | 0x80);                                      \
3248         break;                                                          \
3249       }                                                                 \
3250     else                                                                \
3251       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3252          must invoke it, or, at first, designate it to some graphic     \
3253          register.  Then repeat the loop to actually produce the        \
3254          character.  */                                                 \
3255       dst = encode_invocation_designation (charset, coding, dst,        \
3256                                            &produced_chars);            \
3257   } while (1)
3258
3259
3260 /* Produce codes for a DIMENSION2 character whose character set is
3261    CHARSET and whose position-codes are C1 and C2.  Designation and
3262    invocation codes are also produced in advance if necessary.  */
3263
3264 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2)                \
3265   do {                                                                  \
3266     int id = CHARSET_ID (charset);                                      \
3267                                                                         \
3268     if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS)        \
3269         && id == charset_jisx0208)                                      \
3270       {                                                                 \
3271         id = charset_jisx0208_1978;                                     \
3272         charset = CHARSET_FROM_ID (id);                                 \
3273       }                                                                 \
3274                                                                         \
3275     if (CODING_ISO_SINGLE_SHIFTING (coding))                            \
3276       {                                                                 \
3277         if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)     \
3278           EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);              \
3279         else                                                            \
3280           EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                    \
3281         CODING_ISO_SINGLE_SHIFTING (coding) = 0;                        \
3282         break;                                                          \
3283       }                                                                 \
3284     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0))              \
3285       {                                                                 \
3286         EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F);                \
3287         break;                                                          \
3288       }                                                                 \
3289     else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1))              \
3290       {                                                                 \
3291         EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80);                      \
3292         break;                                                          \
3293       }                                                                 \
3294     else                                                                \
3295       /* Since CHARSET is not yet invoked to any graphic planes, we     \
3296          must invoke it, or, at first, designate it to some graphic     \
3297          register.  Then repeat the loop to actually produce the        \
3298          character.  */                                                 \
3299       dst = encode_invocation_designation (charset, coding, dst,        \
3300                                            &produced_chars);            \
3301   } while (1)
3302
3303
3304 #define ENCODE_ISO_CHARACTER(charset, c)                                   \
3305   do {                                                                     \
3306     int code = ENCODE_CHAR ((charset),(c));                                \
3307                                                                            \
3308     if (CHARSET_DIMENSION (charset) == 1)                                  \
3309       ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code);                   \
3310     else                                                                   \
3311       ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3312   } while (0)
3313
3314
3315 /* Produce designation and invocation codes at a place pointed by DST
3316    to use CHARSET.  The element `spec.iso_2022' of *CODING is updated.
3317    Return new DST.  */
3318
3319 unsigned char *
3320 encode_invocation_designation (charset, coding, dst, p_nchars)
3321      struct charset *charset;
3322      struct coding_system *coding;
3323      unsigned char *dst;
3324      int *p_nchars;
3325 {
3326   int multibytep = coding->dst_multibyte;
3327   int produced_chars = *p_nchars;
3328   int reg;                      /* graphic register number */
3329   int id = CHARSET_ID (charset);
3330
3331   /* At first, check designations.  */
3332   for (reg = 0; reg < 4; reg++)
3333     if (id == CODING_ISO_DESIGNATION (coding, reg))
3334       break;
3335
3336   if (reg >= 4)
3337     {
3338       /* CHARSET is not yet designated to any graphic registers.  */
3339       /* At first check the requested designation.  */
3340       reg = CODING_ISO_REQUEST (coding, id);
3341       if (reg < 0)
3342         /* Since CHARSET requests no special designation, designate it
3343            to graphic register 0.  */
3344         reg = 0;
3345
3346       ENCODE_DESIGNATION (charset, reg, coding);
3347     }
3348
3349   if (CODING_ISO_INVOCATION (coding, 0) != reg
3350       && CODING_ISO_INVOCATION (coding, 1) != reg)
3351     {
3352       /* Since the graphic register REG is not invoked to any graphic
3353          planes, invoke it to graphic plane 0.  */
3354       switch (reg)
3355         {
3356         case 0:                 /* graphic register 0 */
3357           ENCODE_SHIFT_IN;
3358           break;
3359
3360         case 1:                 /* graphic register 1 */
3361           ENCODE_SHIFT_OUT;
3362           break;
3363
3364         case 2:                 /* graphic register 2 */
3365           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3366             ENCODE_SINGLE_SHIFT_2;
3367           else
3368             ENCODE_LOCKING_SHIFT_2;
3369           break;
3370
3371         case 3:                 /* graphic register 3 */
3372           if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3373             ENCODE_SINGLE_SHIFT_3;
3374           else
3375             ENCODE_LOCKING_SHIFT_3;
3376           break;
3377         }
3378     }
3379
3380   *p_nchars = produced_chars;
3381   return dst;
3382 }
3383
3384 /* The following three macros produce codes for indicating direction
3385    of text.  */
3386 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER                              \
3387   do {                                                                  \
3388     if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS)        \
3389       EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '[');                         \
3390     else                                                                \
3391       EMIT_ONE_BYTE (ISO_CODE_CSI);                                     \
3392   } while (0)
3393
3394
3395 #define ENCODE_DIRECTION_R2L()                  \
3396   do {                                          \
3397     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3398     EMIT_TWO_ASCII_BYTES ('2', ']');            \
3399   } while (0)
3400
3401
3402 #define ENCODE_DIRECTION_L2R()                  \
3403   do {                                          \
3404     ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst);   \
3405     EMIT_TWO_ASCII_BYTES ('0', ']');            \
3406   } while (0)
3407
3408
3409 /* Produce codes for designation and invocation to reset the graphic
3410    planes and registers to initial state.  */
3411 #define ENCODE_RESET_PLANE_AND_REGISTER()                               \
3412   do {                                                                  \
3413     int reg;                                                            \
3414     struct charset *charset;                                            \
3415                                                                         \
3416     if (CODING_ISO_INVOCATION (coding, 0) != 0)                         \
3417       ENCODE_SHIFT_IN;                                                  \
3418     for (reg = 0; reg < 4; reg++)                                       \
3419       if (CODING_ISO_INITIAL (coding, reg) >= 0                         \
3420           && (CODING_ISO_DESIGNATION (coding, reg)                      \
3421               != CODING_ISO_INITIAL (coding, reg)))                     \
3422         {                                                               \
3423           charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3424           ENCODE_DESIGNATION (charset, reg, coding);                    \
3425         }                                                               \
3426   } while (0)
3427
3428
3429 /* Produce designation sequences of charsets in the line started from
3430    SRC to a place pointed by DST, and return updated DST.
3431
3432    If the current block ends before any end-of-line, we may fail to
3433    find all the necessary designations.  */
3434
3435 static unsigned char *
3436 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3437      struct coding_system *coding;
3438      int *charbuf, *charbuf_end;
3439      unsigned char *dst;
3440 {
3441   struct charset *charset;
3442   /* Table of charsets to be designated to each graphic register.  */
3443   int r[4];
3444   int c, found = 0, reg;
3445   int produced_chars = 0;
3446   int multibytep = coding->dst_multibyte;
3447   Lisp_Object attrs;
3448   Lisp_Object charset_list;
3449
3450   attrs = CODING_ID_ATTRS (coding->id);
3451   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3452   if (EQ (charset_list, Qiso_2022))
3453     charset_list = Viso_2022_charset_list;
3454
3455   for (reg = 0; reg < 4; reg++)
3456     r[reg] = -1;
3457
3458   while (found < 4)
3459     {
3460       int id;
3461
3462       c = *charbuf++;
3463       if (c == '\n')
3464         break;
3465       charset = char_charset (c, charset_list, NULL);
3466       id = CHARSET_ID (charset);
3467       reg = CODING_ISO_REQUEST (coding, id);
3468       if (reg >= 0 && r[reg] < 0)
3469         {
3470           found++;
3471           r[reg] = id;
3472         }
3473     }
3474
3475   if (found)
3476     {
3477       for (reg = 0; reg < 4; reg++)
3478         if (r[reg] >= 0
3479             && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3480           ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3481     }
3482
3483   return dst;
3484 }
3485
3486 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".  */
3487
3488 static int
3489 encode_coding_iso_2022 (coding)
3490      struct coding_system *coding;
3491 {
3492   int multibytep = coding->dst_multibyte;
3493   int *charbuf = coding->charbuf;
3494   int *charbuf_end = charbuf + coding->charbuf_used;
3495   unsigned char *dst = coding->destination + coding->produced;
3496   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3497   int safe_room = 16;
3498   int bol_designation
3499     = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3500        && CODING_ISO_BOL (coding));
3501   int produced_chars = 0;
3502   Lisp_Object attrs, eol_type, charset_list;
3503   int ascii_compatible;
3504   int c;
3505
3506   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3507   setup_iso_safe_charsets (attrs);
3508   coding->safe_charsets
3509     = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
3510
3511   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3512
3513   while (charbuf < charbuf_end)
3514     {
3515       ASSURE_DESTINATION (safe_room);
3516
3517       if (bol_designation)
3518         {
3519           unsigned char *dst_prev = dst;
3520
3521           /* We have to produce designation sequences if any now.  */
3522           dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3523           bol_designation = 0;
3524           /* We are sure that designation sequences are all ASCII bytes.  */
3525           produced_chars += dst - dst_prev;
3526         }
3527
3528       c = *charbuf++;
3529
3530       /* Now encode the character C.  */
3531       if (c < 0x20 || c == 0x7F)
3532         {
3533           if (c == '\n'
3534               || (c == '\r' && EQ (eol_type, Qmac)))
3535             {
3536               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3537                 ENCODE_RESET_PLANE_AND_REGISTER ();
3538               if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3539                 {
3540                   int i;
3541
3542                   for (i = 0; i < 4; i++)
3543                     CODING_ISO_DESIGNATION (coding, i)
3544                       = CODING_ISO_INITIAL (coding, i);
3545                 }
3546               bol_designation
3547                 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3548             }
3549           else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3550             ENCODE_RESET_PLANE_AND_REGISTER ();
3551           EMIT_ONE_ASCII_BYTE (c);
3552         }
3553       else if (ASCII_CHAR_P (c))
3554         {
3555           if (ascii_compatible)
3556             EMIT_ONE_ASCII_BYTE (c);
3557           else
3558             {
3559               struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3560               ENCODE_ISO_CHARACTER (charset, c);
3561             }
3562         }
3563       else if (CHAR_BYTE8_P (c))
3564         {
3565           c = CHAR_TO_BYTE8 (c);
3566           EMIT_ONE_BYTE (c);
3567         }
3568       else
3569         {
3570           struct charset *charset = char_charset (c, charset_list, NULL);
3571
3572           if (!charset)
3573             {
3574               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3575                 {
3576                   c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3577                   charset = CHARSET_FROM_ID (charset_ascii);
3578                 }
3579               else
3580                 {
3581                   c = coding->default_char;
3582                   charset = char_charset (c, charset_list, NULL);
3583                 }
3584             }
3585           ENCODE_ISO_CHARACTER (charset, c);
3586         }
3587     }
3588
3589   if (coding->mode & CODING_MODE_LAST_BLOCK
3590       && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3591     {
3592       ASSURE_DESTINATION (safe_room);
3593       ENCODE_RESET_PLANE_AND_REGISTER ();
3594     }
3595   coding->result = CODING_RESULT_SUCCESS;
3596   CODING_ISO_BOL (coding) = bol_designation;
3597   coding->produced_char += produced_chars;
3598   coding->produced = dst - coding->destination;
3599   return 0;
3600 }
3601
3602 \f
3603 /*** 8,9. SJIS and BIG5 handlers ***/
3604
3605 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3606    quite widely.  So, for the moment, Emacs supports them in the bare
3607    C code.  But, in the future, they may be supported only by CCL.  */
3608
3609 /* SJIS is a coding system encoding three character sets: ASCII, right
3610    half of JISX0201-Kana, and JISX0208.  An ASCII character is encoded
3611    as is.  A character of charset katakana-jisx0201 is encoded by
3612    "position-code + 0x80".  A character of charset japanese-jisx0208
3613    is encoded in 2-byte but two position-codes are divided and shifted
3614    so that it fit in the range below.
3615
3616    --- CODE RANGE of SJIS ---
3617    (character set)      (range)
3618    ASCII                0x00 .. 0x7F
3619    KATAKANA-JISX0201    0xA0 .. 0xDF
3620    JISX0208 (1st byte)  0x81 .. 0x9F and 0xE0 .. 0xEF
3621             (2nd byte)  0x40 .. 0x7E and 0x80 .. 0xFC
3622    -------------------------------
3623
3624 */
3625
3626 /* BIG5 is a coding system encoding two character sets: ASCII and
3627    Big5.  An ASCII character is encoded as is.  Big5 is a two-byte
3628    character set and is encoded in two-byte.
3629
3630    --- CODE RANGE of BIG5 ---
3631    (character set)      (range)
3632    ASCII                0x00 .. 0x7F
3633    Big5 (1st byte)      0xA1 .. 0xFE
3634         (2nd byte)      0x40 .. 0x7E and 0xA1 .. 0xFE
3635    --------------------------
3636
3637   */
3638
3639 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3640    Check if a text is encoded in SJIS.  If it is, return
3641    CATEGORY_MASK_SJIS, else return 0.  */
3642
3643 static int
3644 detect_coding_sjis (coding, mask)
3645      struct coding_system *coding;
3646      int *mask;
3647 {
3648   unsigned char *src = coding->source, *src_base = src;
3649   unsigned char *src_end = coding->source + coding->src_bytes;
3650   int multibytep = coding->src_multibyte;
3651   int consumed_chars = 0;
3652   int found = 0;
3653   int c;
3654
3655   /* A coding system of this category is always ASCII compatible.  */
3656   src += coding->head_ascii;
3657
3658   while (1)
3659     {
3660       ONE_MORE_BYTE (c);
3661       if (c < 0x80)
3662         continue;
3663       if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3664         {
3665           ONE_MORE_BYTE (c);
3666           if (c < 0x40 || c == 0x7F || c > 0xFC)
3667             break;
3668           found = 1;
3669         }
3670       else if (c >= 0xA0 && c < 0xE0)
3671         found = 1;
3672       else
3673         break;
3674     }
3675   *mask &= ~CATEGORY_MASK_SJIS;
3676   return 0;
3677
3678  no_more_source:
3679   if (!found)
3680     return 0;
3681   *mask &= CATEGORY_MASK_SJIS;
3682   return 1;
3683 }
3684
3685 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3686    Check if a text is encoded in BIG5.  If it is, return
3687    CATEGORY_MASK_BIG5, else return 0.  */
3688
3689 static int
3690 detect_coding_big5 (coding, mask)
3691      struct coding_system *coding;
3692      int *mask;
3693 {
3694   unsigned char *src = coding->source, *src_base = src;
3695   unsigned char *src_end = coding->source + coding->src_bytes;
3696   int multibytep = coding->src_multibyte;
3697   int consumed_chars = 0;
3698   int found = 0;
3699   int c;
3700
3701   /* A coding system of this category is always ASCII compatible.  */
3702   src += coding->head_ascii;
3703
3704   while (1)
3705     {
3706       ONE_MORE_BYTE (c);
3707       if (c < 0x80)
3708         continue;
3709       if (c >= 0xA1)
3710         {
3711           ONE_MORE_BYTE (c);
3712           if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3713             return 0;
3714           found = 1;
3715         }
3716       else
3717         break;
3718     }
3719   *mask &= ~CATEGORY_MASK_BIG5;
3720   return 0;
3721
3722  no_more_source:
3723   if (!found)
3724     return 0;
3725   *mask &= CATEGORY_MASK_BIG5;
3726   return 1;
3727 }
3728
3729 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3730    If SJIS_P is 1, decode SJIS text, else decode BIG5 test.  */
3731
3732 static void
3733 decode_coding_sjis (coding)
3734      struct coding_system *coding;
3735 {
3736   unsigned char *src = coding->source + coding->consumed;
3737   unsigned char *src_end = coding->source + coding->src_bytes;
3738   unsigned char *src_base;
3739   int *charbuf = coding->charbuf;
3740   int *charbuf_end = charbuf + coding->charbuf_size;
3741   int consumed_chars = 0, consumed_chars_base;
3742   int multibytep = coding->src_multibyte;
3743   struct charset *charset_roman, *charset_kanji, *charset_kana;
3744   Lisp_Object attrs, eol_type, charset_list, val;
3745
3746   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3747
3748   val = charset_list;
3749   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3750   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3751   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val)));
3752
3753   while (1)
3754     {
3755       int c, c1;
3756
3757       src_base = src;
3758       consumed_chars_base = consumed_chars;
3759
3760       if (charbuf >= charbuf_end)
3761         break;
3762
3763       ONE_MORE_BYTE (c);
3764
3765       if (c == '\r')
3766         {
3767           if (EQ (eol_type, Qdos))
3768             {
3769               if (src == src_end)
3770                 goto no_more_source;
3771               if (*src == '\n')
3772                 ONE_MORE_BYTE (c);
3773             }
3774           else if (EQ (eol_type, Qmac))
3775             c = '\n';
3776         }
3777       else
3778         {
3779           struct charset *charset;
3780
3781           if (c < 0x80)
3782             charset = charset_roman;
3783           else
3784             {
3785               if (c >= 0xF0)
3786                 goto invalid_code;
3787               if (c < 0xA0 || c >= 0xE0)
3788                 {
3789                   /* SJIS -> JISX0208 */
3790                   ONE_MORE_BYTE (c1);
3791                   if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3792                     goto invalid_code;
3793                   c = (c << 8) | c1;
3794                   SJIS_TO_JIS (c);
3795                   charset = charset_kanji;
3796                 }
3797               else
3798                 /* SJIS -> JISX0201-Kana */
3799                 charset = charset_kana;
3800             }
3801           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3802         }
3803       *charbuf++ = c;
3804       continue;
3805
3806     invalid_code:
3807       src = src_base;
3808       consumed_chars = consumed_chars_base;
3809       ONE_MORE_BYTE (c);
3810       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3811       coding->errors++;
3812     }
3813
3814  no_more_source:
3815   coding->consumed_char += consumed_chars_base;
3816   coding->consumed = src_base - coding->source;
3817   coding->charbuf_used = charbuf - coding->charbuf;
3818 }
3819
3820 static void
3821 decode_coding_big5 (coding)
3822      struct coding_system *coding;
3823 {
3824   unsigned char *src = coding->source + coding->consumed;
3825   unsigned char *src_end = coding->source + coding->src_bytes;
3826   unsigned char *src_base;
3827   int *charbuf = coding->charbuf;
3828   int *charbuf_end = charbuf + coding->charbuf_size;
3829   int consumed_chars = 0, consumed_chars_base;
3830   int multibytep = coding->src_multibyte;
3831   struct charset *charset_roman, *charset_big5;
3832   Lisp_Object attrs, eol_type, charset_list, val;
3833
3834   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3835   val = charset_list;
3836   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3837   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
3838
3839   while (1)
3840     {
3841       int c, c1;
3842
3843       src_base = src;
3844       consumed_chars_base = consumed_chars;
3845
3846       if (charbuf >= charbuf_end)
3847         break;
3848
3849       ONE_MORE_BYTE (c);
3850
3851       if (c == '\r')
3852         {
3853           if (EQ (eol_type, Qdos))
3854             {
3855               if (src == src_end)
3856                 goto no_more_source;
3857               if (*src == '\n')
3858                 ONE_MORE_BYTE (c);
3859             }
3860           else if (EQ (eol_type, Qmac))
3861             c = '\n';
3862         }
3863       else
3864         {
3865           struct charset *charset;
3866           if (c < 0x80)
3867             charset = charset_roman;
3868           else
3869             {
3870               /* BIG5 -> Big5 */
3871               if (c < 0xA1 || c > 0xFE)
3872                 goto invalid_code;
3873               ONE_MORE_BYTE (c1);
3874               if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
3875                 goto invalid_code;
3876               c = c << 8 | c1;
3877               charset = charset_big5;
3878             }
3879           CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
3880         }
3881
3882       *charbuf++ = c;
3883       continue;
3884
3885     invalid_code:
3886       src = src_base;
3887       consumed_chars = consumed_chars_base;
3888       ONE_MORE_BYTE (c);
3889       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3890       coding->errors++;
3891     }
3892
3893  no_more_source:
3894   coding->consumed_char += consumed_chars_base;
3895   coding->consumed = src_base - coding->source;
3896   coding->charbuf_used = charbuf - coding->charbuf;
3897 }
3898
3899 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
3900    This function can encode charsets `ascii', `katakana-jisx0201',
3901    `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'.  We
3902    are sure that all these charsets are registered as official charset
3903    (i.e. do not have extended leading-codes).  Characters of other
3904    charsets are produced without any encoding.  If SJIS_P is 1, encode
3905    SJIS text, else encode BIG5 text.  */
3906
3907 static int
3908 encode_coding_sjis (coding)
3909      struct coding_system *coding;
3910 {
3911   int multibytep = coding->dst_multibyte;
3912   int *charbuf = coding->charbuf;
3913   int *charbuf_end = charbuf + coding->charbuf_used;
3914   unsigned char *dst = coding->destination + coding->produced;
3915   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3916   int safe_room = 4;
3917   int produced_chars = 0;
3918   Lisp_Object attrs, eol_type, charset_list, val;
3919   int ascii_compatible;
3920   struct charset *charset_roman, *charset_kanji, *charset_kana;
3921   int c;
3922
3923   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3924   val = charset_list;
3925   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3926   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3927   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3928
3929   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3930
3931   while (charbuf < charbuf_end)
3932     {
3933       ASSURE_DESTINATION (safe_room);
3934       c = *charbuf++;
3935       /* Now encode the character C.  */
3936       if (ASCII_CHAR_P (c) && ascii_compatible)
3937         EMIT_ONE_ASCII_BYTE (c);
3938       else if (CHAR_BYTE8_P (c))
3939         {
3940           c = CHAR_TO_BYTE8 (c);
3941           EMIT_ONE_BYTE (c);
3942         }
3943       else
3944         {
3945           unsigned code;
3946           struct charset *charset = char_charset (c, charset_list, &code);
3947
3948           if (!charset)
3949             {
3950               if (coding->mode & CODING_MODE_SAFE_ENCODING)
3951                 {
3952                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3953                   charset = CHARSET_FROM_ID (charset_ascii);
3954                 }
3955               else
3956                 {
3957                   c = coding->default_char;
3958                   charset = char_charset (c, charset_list, &code);
3959                 }
3960             }
3961           if (code == CHARSET_INVALID_CODE (charset))
3962             abort ();
3963           if (charset == charset_kanji)
3964             {
3965               int c1, c2;
3966               JIS_TO_SJIS (code);
3967               c1 = code >> 8, c2 = code & 0xFF;
3968               EMIT_TWO_BYTES (c1, c2);
3969             }
3970           else if (charset == charset_kana)
3971             EMIT_ONE_BYTE (code | 0x80);
3972           else
3973             EMIT_ONE_ASCII_BYTE (code & 0x7F);
3974         }
3975     }
3976   coding->result = CODING_RESULT_SUCCESS;
3977   coding->produced_char += produced_chars;
3978   coding->produced = dst - coding->destination;
3979   return 0;
3980 }
3981
3982 static int
3983 encode_coding_big5 (coding)
3984      struct coding_system *coding;
3985 {
3986   int multibytep = coding->dst_multibyte;
3987   int *charbuf = coding->charbuf;
3988   int *charbuf_end = charbuf + coding->charbuf_used;
3989   unsigned char *dst = coding->destination + coding->produced;
3990   unsigned char *dst_end = coding->destination + coding->dst_bytes;
3991   int safe_room = 4;
3992   int produced_chars = 0;
3993   Lisp_Object attrs, eol_type, charset_list, val;
3994   int ascii_compatible;
3995   struct charset *charset_roman, *charset_big5;
3996   int c;
3997
3998   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3999   val = charset_list;
4000   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4001   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4002   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4003
4004   while (charbuf < charbuf_end)
4005     {
4006       ASSURE_DESTINATION (safe_room);
4007       c = *charbuf++;
4008       /* Now encode the character C.  */
4009       if (ASCII_CHAR_P (c) && ascii_compatible)
4010         EMIT_ONE_ASCII_BYTE (c);
4011       else if (CHAR_BYTE8_P (c))
4012         {
4013           c = CHAR_TO_BYTE8 (c);
4014           EMIT_ONE_BYTE (c);
4015         }
4016       else
4017         {
4018           unsigned code;
4019           struct charset *charset = char_charset (c, charset_list, &code);
4020
4021           if (! charset)
4022             {
4023               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4024                 {
4025                   code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4026                   charset = CHARSET_FROM_ID (charset_ascii);
4027                 }
4028               else
4029                 {
4030                   c = coding->default_char;
4031                   charset = char_charset (c, charset_list, &code);
4032                 }
4033             }
4034           if (code == CHARSET_INVALID_CODE (charset))
4035             abort ();
4036           if (charset == charset_big5)
4037             {
4038               int c1, c2;
4039
4040               c1 = code >> 8, c2 = code & 0xFF;
4041               EMIT_TWO_BYTES (c1, c2);
4042             }
4043           else
4044             EMIT_ONE_ASCII_BYTE (code & 0x7F);
4045         }
4046     }
4047   coding->result = CODING_RESULT_SUCCESS;
4048   coding->produced_char += produced_chars;
4049   coding->produced = dst - coding->destination;
4050   return 0;
4051 }
4052
4053 \f
4054 /*** 10. CCL handlers ***/
4055
4056 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4057    Check if a text is encoded in a coding system of which
4058    encoder/decoder are written in CCL program.  If it is, return
4059    CATEGORY_MASK_CCL, else return 0.  */
4060
4061 static int
4062 detect_coding_ccl (coding, mask)
4063      struct coding_system *coding;
4064      int *mask;
4065 {
4066   unsigned char *src = coding->source, *src_base = src;
4067   unsigned char *src_end = coding->source + coding->src_bytes;
4068   int multibytep = coding->src_multibyte;
4069   int consumed_chars = 0;
4070   int found = 0;
4071   unsigned char *valids = CODING_CCL_VALIDS (coding);
4072   int head_ascii = coding->head_ascii;
4073   Lisp_Object attrs;
4074
4075   coding = &coding_categories[coding_category_ccl];
4076   attrs = CODING_ID_ATTRS (coding->id);
4077   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4078     src += head_ascii;
4079
4080   while (1)
4081     {
4082       int c;
4083       ONE_MORE_BYTE (c);
4084       if (! valids[c])
4085         break;
4086       if (!found && valids[c] > 1)
4087         found = 1;
4088     }
4089   *mask &= ~CATEGORY_MASK_CCL;
4090   return 0;
4091
4092  no_more_source:
4093   if (!found)
4094     return 0;
4095   *mask &= CATEGORY_MASK_CCL;
4096   return 1;
4097 }
4098
4099 static void
4100 decode_coding_ccl (coding)
4101      struct coding_system *coding;
4102 {
4103   unsigned char *src = coding->source + coding->consumed;
4104   unsigned char *src_end = coding->source + coding->src_bytes;
4105   int *charbuf = coding->charbuf;
4106   int *charbuf_end = charbuf + coding->charbuf_size;
4107   int consumed_chars = 0;
4108   int multibytep = coding->src_multibyte;
4109   struct ccl_program ccl;
4110   int source_charbuf[1024];
4111   int source_byteidx[1024];
4112
4113   setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4114
4115   while (src < src_end)
4116     {
4117       unsigned char *p = src;
4118       int *source, *source_end;
4119       int i = 0;
4120
4121       if (multibytep)
4122         while (i < 1024 && p < src_end)
4123           {
4124             source_byteidx[i] = p - src;
4125             source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4126           }
4127       else
4128         while (i < 1024 && p < src_end)
4129           source_charbuf[i++] = *p++;
4130
4131       if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4132         ccl.last_block = 1;
4133
4134       source = source_charbuf;
4135       source_end = source + i;
4136       while (source < source_end)
4137         {
4138           ccl_driver (&ccl, source, charbuf,
4139                       source_end - source, charbuf_end - charbuf);
4140           source += ccl.consumed;
4141           charbuf += ccl.produced;
4142           if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4143             break;
4144         }
4145       if (source < source_end)
4146         src += source_byteidx[source - source_charbuf];
4147       else
4148         src = p;
4149       consumed_chars += source - source_charbuf;
4150
4151       if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4152           && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4153         break;
4154     }
4155
4156   switch (ccl.status)
4157     {
4158     case CCL_STAT_SUSPEND_BY_SRC:
4159       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4160       break;
4161     case CCL_STAT_SUSPEND_BY_DST:
4162       break;
4163     case CCL_STAT_QUIT:
4164     case CCL_STAT_INVALID_CMD:
4165       coding->result = CODING_RESULT_INTERRUPT;
4166       break;
4167     default:
4168       coding->result = CODING_RESULT_SUCCESS;
4169       break;
4170     }
4171   coding->consumed_char += consumed_chars;
4172   coding->consumed = src - coding->source;
4173   coding->charbuf_used = charbuf - coding->charbuf;
4174 }
4175
4176 static int
4177 encode_coding_ccl (coding)
4178      struct coding_system *coding;
4179 {
4180   struct ccl_program ccl;
4181   int multibytep = coding->dst_multibyte;
4182   int *charbuf = coding->charbuf;
4183   int *charbuf_end = charbuf + coding->charbuf_used;
4184   unsigned char *dst = coding->destination + coding->produced;
4185   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4186   unsigned char *adjusted_dst_end = dst_end - 1;
4187   int destination_charbuf[1024];
4188   int i, produced_chars = 0;
4189
4190   setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4191
4192   ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4193   ccl.dst_multibyte = coding->dst_multibyte;
4194
4195   while (charbuf < charbuf_end && dst < adjusted_dst_end)
4196     {
4197       int dst_bytes = dst_end - dst;
4198       if (dst_bytes > 1024)
4199         dst_bytes = 1024;
4200
4201       ccl_driver (&ccl, charbuf, destination_charbuf,
4202                   charbuf_end - charbuf, dst_bytes);
4203       charbuf += ccl.consumed;
4204       if (multibytep)
4205         for (i = 0; i < ccl.produced; i++)
4206           EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4207       else
4208         {
4209           for (i = 0; i < ccl.produced; i++)
4210             *dst++ = destination_charbuf[i] & 0xFF;
4211           produced_chars += ccl.produced;
4212         }
4213     }
4214
4215   switch (ccl.status)
4216     {
4217     case CCL_STAT_SUSPEND_BY_SRC:
4218       coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4219       break;
4220     case CCL_STAT_SUSPEND_BY_DST:
4221       coding->result = CODING_RESULT_INSUFFICIENT_DST;
4222       break;
4223     case CCL_STAT_QUIT:
4224     case CCL_STAT_INVALID_CMD:
4225       coding->result = CODING_RESULT_INTERRUPT;
4226       break;
4227     default:
4228       coding->result = CODING_RESULT_SUCCESS;
4229       break;
4230     }
4231
4232   coding->produced_char += produced_chars;
4233   coding->produced = dst - coding->destination;
4234   return 0;
4235 }
4236
4237
4238 \f
4239 /*** 10, 11. no-conversion handlers ***/
4240
4241 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".  */
4242
4243 static void
4244 decode_coding_raw_text (coding)
4245      struct coding_system *coding;
4246 {
4247   coding->chars_at_source = 1;
4248   coding->consumed_char = 0;
4249   coding->consumed = 0;
4250   coding->result = CODING_RESULT_SUCCESS;
4251 }
4252
4253 static int
4254 encode_coding_raw_text (coding)
4255      struct coding_system *coding;
4256 {
4257   int multibytep = coding->dst_multibyte;
4258   int *charbuf = coding->charbuf;
4259   int *charbuf_end = coding->charbuf + coding->charbuf_used;
4260   unsigned char *dst = coding->destination + coding->produced;
4261   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4262   int produced_chars = 0;
4263   int c;
4264
4265   if (multibytep)
4266     {
4267       int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4268
4269       if (coding->src_multibyte)
4270         while (charbuf < charbuf_end)
4271           {
4272             ASSURE_DESTINATION (safe_room);
4273             c = *charbuf++;
4274             if (ASCII_CHAR_P (c))
4275               EMIT_ONE_ASCII_BYTE (c);
4276             else if (CHAR_BYTE8_P (c))
4277               {
4278                 c = CHAR_TO_BYTE8 (c);
4279                 EMIT_ONE_BYTE (c);
4280               }
4281             else
4282               {
4283                 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4284
4285                 CHAR_STRING_ADVANCE (c, p1);
4286                 while (p0 < p1)
4287                   EMIT_ONE_BYTE (*p0);
4288               }
4289           }
4290       else
4291         while (charbuf < charbuf_end)
4292           {
4293             ASSURE_DESTINATION (safe_room);
4294             c = *charbuf++;
4295             EMIT_ONE_BYTE (c);
4296           }
4297     }
4298   else
4299     {
4300       if (coding->src_multibyte)
4301         {
4302           int safe_room = MAX_MULTIBYTE_LENGTH;
4303
4304           while (charbuf < charbuf_end)
4305             {
4306               ASSURE_DESTINATION (safe_room);
4307               c = *charbuf++;
4308               if (ASCII_CHAR_P (c))
4309                 *dst++ = c;
4310               else if (CHAR_BYTE8_P (c))
4311                 *dst++ = CHAR_TO_BYTE8 (c);
4312               else
4313                 CHAR_STRING_ADVANCE (c, dst);
4314               produced_chars++;
4315             }
4316         }
4317       else
4318         {
4319           ASSURE_DESTINATION (charbuf_end - charbuf);
4320           while (charbuf < charbuf_end && dst < dst_end)
4321             *dst++ = *charbuf++;
4322           produced_chars = dst - (coding->destination + coding->dst_bytes);
4323         }
4324     }
4325   coding->result = CODING_RESULT_SUCCESS;
4326   coding->produced_char += produced_chars;
4327   coding->produced = dst - coding->destination;
4328   return 0;
4329 }
4330
4331 static int
4332 detect_coding_charset (coding, mask)
4333      struct coding_system *coding;
4334      int *mask;
4335 {
4336   unsigned char *src = coding->source, *src_base = src;
4337   unsigned char *src_end = coding->source + coding->src_bytes;
4338   int multibytep = coding->src_multibyte;
4339   int consumed_chars = 0;
4340   Lisp_Object attrs, valids;
4341
4342   coding = &coding_categories[coding_category_charset];
4343   attrs = CODING_ID_ATTRS (coding->id);
4344   valids = AREF (attrs, coding_attr_charset_valids);
4345
4346   if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4347     src += coding->head_ascii;
4348
4349   while (1)
4350     {
4351       int c;
4352
4353       ONE_MORE_BYTE (c);
4354       if (NILP (AREF (valids, c)))
4355         break;
4356     }
4357   *mask &= ~CATEGORY_MASK_CHARSET;
4358   return 0;
4359
4360  no_more_source:
4361   *mask &= CATEGORY_MASK_CHARSET;
4362   return 1;
4363 }
4364
4365 static void
4366 decode_coding_charset (coding)
4367      struct coding_system *coding;
4368 {
4369   unsigned char *src = coding->source + coding->consumed;
4370   unsigned char *src_end = coding->source + coding->src_bytes;
4371   unsigned char *src_base;
4372   int *charbuf = coding->charbuf;
4373   int *charbuf_end = charbuf + coding->charbuf_size;
4374   int consumed_chars = 0, consumed_chars_base;
4375   int multibytep = coding->src_multibyte;
4376   Lisp_Object attrs, eol_type, charset_list, valids;
4377
4378   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4379   valids = AREF (attrs, coding_attr_charset_valids);
4380
4381   while (1)
4382     {
4383       int c;
4384
4385       src_base = src;
4386       consumed_chars_base = consumed_chars;
4387
4388       if (charbuf >= charbuf_end)
4389         break;
4390
4391       ONE_MORE_BYTE (c);
4392       if (c == '\r')
4393         {
4394           /* Here we assume that no charset maps '\r' to something
4395              else.  */
4396           if (EQ (eol_type, Qdos))
4397             {
4398               if (src < src_end
4399                   && *src == '\n')
4400                 ONE_MORE_BYTE (c);
4401             }
4402           else if (EQ (eol_type, Qmac))
4403             c = '\n';
4404         }
4405       else
4406         {
4407           Lisp_Object val;
4408           struct charset *charset;
4409           int dim;
4410           int len = 1;
4411           unsigned code = c;
4412
4413           val = AREF (valids, c);
4414           if (NILP (val))
4415             goto invalid_code;
4416           if (INTEGERP (val))
4417             {
4418               charset = CHARSET_FROM_ID (XFASTINT (val));
4419               dim = CHARSET_DIMENSION (charset);
4420               while (len < dim)
4421                 {
4422                   ONE_MORE_BYTE (c);
4423                   code = (code << 8) | c;
4424                   len++;
4425                 }
4426               CODING_DECODE_CHAR (coding, src, src_base, src_end,
4427                                   charset, code, c);
4428             }
4429           else
4430             {
4431               /* VAL is a list of charset IDs.  It is assured that the
4432                  list is sorted by charset dimensions (smaller one
4433                  comes first).  */
4434               while (CONSP (val))
4435                 {
4436                   charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4437                   dim = CHARSET_DIMENSION (charset);
4438                   while (len < dim)
4439                     {
4440                       ONE_MORE_BYTE (c);
4441                       code = (code << 8) | c;
4442                       len++;
4443                     }
4444                   CODING_DECODE_CHAR (coding, src, src_base,
4445                                       src_end, charset, code, c);
4446                   if (c >= 0)
4447                     break;
4448                   val = XCDR (val);
4449                 }
4450             }
4451           if (c < 0)
4452             goto invalid_code;
4453         }
4454       *charbuf++ = c;
4455       continue;
4456
4457     invalid_code:
4458       src = src_base;
4459       consumed_chars = consumed_chars_base;
4460       ONE_MORE_BYTE (c);
4461       *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4462       coding->errors++;
4463     }
4464
4465  no_more_source:
4466   coding->consumed_char += consumed_chars_base;
4467   coding->consumed = src_base - coding->source;
4468   coding->charbuf_used = charbuf - coding->charbuf;
4469 }
4470
4471 static int
4472 encode_coding_charset (coding)
4473      struct coding_system *coding;
4474 {
4475   int multibytep = coding->dst_multibyte;
4476   int *charbuf = coding->charbuf;
4477   int *charbuf_end = charbuf + coding->charbuf_used;
4478   unsigned char *dst = coding->destination + coding->produced;
4479   unsigned char *dst_end = coding->destination + coding->dst_bytes;
4480   int safe_room = MAX_MULTIBYTE_LENGTH;
4481   int produced_chars = 0;
4482   Lisp_Object attrs, eol_type, charset_list;
4483   int ascii_compatible;
4484   int c;
4485
4486   CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4487   ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4488
4489   while (charbuf < charbuf_end)
4490     {
4491       struct charset *charset;
4492       unsigned code;
4493
4494       ASSURE_DESTINATION (safe_room);
4495       c = *charbuf++;
4496       if (ascii_compatible && ASCII_CHAR_P (c))
4497         EMIT_ONE_ASCII_BYTE (c);
4498       else if (CHAR_BYTE8_P (c))
4499         {
4500           c = CHAR_TO_BYTE8 (c);
4501           EMIT_ONE_BYTE (c);
4502         }
4503       else
4504         {
4505           charset = char_charset (c, charset_list, &code);
4506           if (charset)
4507             {
4508               if (CHARSET_DIMENSION (charset) == 1)
4509                 EMIT_ONE_BYTE (code);
4510               else if (CHARSET_DIMENSION (charset) == 2)
4511                 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4512               else if (CHARSET_DIMENSION (charset) == 3)
4513                 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4514               else
4515                 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4516                                  (code >> 8) & 0xFF, code & 0xFF);
4517             }
4518           else
4519             {
4520               if (coding->mode & CODING_MODE_SAFE_ENCODING)
4521                 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4522               else
4523                 c = coding->default_char;
4524               EMIT_ONE_BYTE (c);
4525             }
4526         }
4527     }
4528
4529   coding->result = CODING_RESULT_SUCCESS;
4530   coding->produced_char += produced_chars;
4531   coding->produced = dst - coding->destination;
4532   return 0;
4533 }
4534
4535 \f
4536 /*** 7. C library functions ***/
4537
4538 /* In Emacs Lisp, coding system is represented by a Lisp symbol which
4539    has a property `coding-system'.  The value of this property is a
4540    vector of length 5 (called as coding-vector).  Among elements of
4541    this vector, the first (element[0]) and the fifth (element[4])
4542    carry important information for decoding/encoding.  Before
4543    decoding/encoding, this information should be set in fields of a
4544    structure of type `coding_system'.
4545
4546    A value of property `coding-system' can be a symbol of another
4547    subsidiary coding-system.  In that case, Emacs gets coding-vector
4548    from that symbol.
4549
4550    `element[0]' contains information to be set in `coding->type'.  The
4551    value and its meaning is as follows:
4552
4553    0 -- coding_type_emacs_mule
4554    1 -- coding_type_sjis
4555    2 -- coding_type_iso_2022
4556    3 -- coding_type_big5
4557    4 -- coding_type_ccl encoder/decoder written in CCL
4558    nil -- coding_type_no_conversion
4559    t -- coding_type_undecided (automatic conversion on decoding,
4560                                no-conversion on encoding)
4561
4562    `element[4]' contains information to be set in `coding->flags' and
4563    `coding->spec'.  The meaning varies by `coding->type'.
4564
4565    If `coding->type' is `coding_type_iso_2022', element[4] is a vector
4566    of length 32 (of which the first 13 sub-elements are used now).
4567    Meanings of these sub-elements are:
4568
4569    sub-element[N] where N is 0 through 3: to be set in `coding->spec.iso_2022'
4570         If the value is an integer of valid charset, the charset is
4571         assumed to be designated to graphic register N initially.
4572
4573         If the value is minus, it is a minus value of charset which
4574         reserves graphic register N, which means that the charset is
4575         not designated initially but should be designated to graphic
4576         register N just before encoding a character in that charset.
4577
4578         If the value is nil, graphic register N is never used on
4579         encoding.
4580
4581    sub-element[N] where N is 4 through 11: to be set in `coding->flags'
4582         Each value takes t or nil.  See the section ISO2022 of
4583         `coding.h' for more information.
4584
4585    If `coding->type' is `coding_type_big5', element[4] is t to denote
4586    BIG5-ETen or nil to denote BIG5-HKU.
4587
4588    If `coding->type' takes the other value, element[4] is ignored.
4589
4590    Emacs Lisp's coding system also carries information about format of
4591    end-of-line in a value of property `eol-type'.  If the value is
4592    integer, 0 means eol_lf, 1 means eol_crlf, and 2 means eol_cr.  If
4593    it is not integer, it should be a vector of subsidiary coding
4594    systems of which property `eol-type' has one of above values.
4595
4596 */
4597
4598 /* Setup coding context CODING from information about CODING_SYSTEM.
4599    If CODING_SYSTEM is nil, `no-conversion' is assumed.  If
4600    CODING_SYSTEM is invalid, signal an error.  */
4601
4602 void
4603 setup_coding_system (coding_system, coding)
4604      Lisp_Object coding_system;
4605      struct coding_system *coding;
4606 {
4607   Lisp_Object attrs;
4608   Lisp_Object eol_type;
4609   Lisp_Object coding_type;
4610   Lisp_Object val;
4611
4612   if (NILP (coding_system))
4613     coding_system = Qno_conversion;
4614
4615   CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4616
4617   attrs = CODING_ID_ATTRS (coding->id);
4618   eol_type = CODING_ID_EOL_TYPE (coding->id);
4619
4620   coding->mode = 0;
4621   coding->head_ascii = -1;
4622   coding->common_flags
4623     = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4624
4625   val = CODING_ATTR_SAFE_CHARSETS (attrs);
4626   coding->max_charset_id = XSTRING (val)->size - 1;
4627   coding->safe_charsets = (char *) XSTRING (val)->data;
4628   coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4629
4630   coding_type = CODING_ATTR_TYPE (attrs);
4631   if (EQ (coding_type, Qundecided))
4632     {
4633       coding->detector = NULL;
4634       coding->decoder = decode_coding_raw_text;
4635       coding->encoder = encode_coding_raw_text;
4636       coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4637     }
4638   else if (EQ (coding_type, Qiso_2022))
4639     {
4640       int i;
4641       int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4642
4643       /* Invoke graphic register 0 to plane 0.  */
4644       CODING_ISO_INVOCATION (coding, 0) = 0;
4645       /* Invoke graphic register 1 to plane 1 if we can use 8-bit.  */
4646       CODING_ISO_INVOCATION (coding, 1)
4647         = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4648       /* Setup the initial status of designation.  */
4649       for (i = 0; i < 4; i++)
4650         CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4651       /* Not single shifting initially.  */
4652       CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4653       /* Beginning of buffer should also be regarded as bol. */
4654       CODING_ISO_BOL (coding) = 1;
4655       coding->detector = detect_coding_iso_2022;
4656       coding->decoder = decode_coding_iso_2022;
4657       coding->encoder = encode_coding_iso_2022;
4658       if (flags & CODING_ISO_FLAG_SAFE)
4659         coding->mode |= CODING_MODE_SAFE_ENCODING;
4660       coding->common_flags
4661         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4662             | CODING_REQUIRE_FLUSHING_MASK);
4663       if (flags & CODING_ISO_FLAG_COMPOSITION)
4664         coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4665       if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4666         {
4667           setup_iso_safe_charsets (attrs);
4668           val = CODING_ATTR_SAFE_CHARSETS (attrs);
4669           coding->max_charset_id = XSTRING (val)->size - 1;
4670           coding->safe_charsets = (char *) XSTRING (val)->data;
4671         }
4672       CODING_ISO_FLAGS (coding) = flags;
4673     }
4674   else if (EQ (coding_type, Qcharset))
4675     {
4676       coding->detector = detect_coding_charset;
4677       coding->decoder = decode_coding_charset;
4678       coding->encoder = encode_coding_charset;
4679       coding->common_flags
4680         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4681     }
4682   else if (EQ (coding_type, Qutf_8))
4683     {
4684       coding->detector = detect_coding_utf_8;
4685       coding->decoder = decode_coding_utf_8;
4686       coding->encoder = encode_coding_utf_8;
4687       coding->common_flags
4688         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4689     }
4690   else if (EQ (coding_type, Qutf_16))
4691     {
4692       val = AREF (attrs, coding_attr_utf_16_bom);
4693       CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4694                                     : EQ (val, Qt) ? utf_16_with_bom
4695                                     : utf_16_without_bom);
4696       val = AREF (attrs, coding_attr_utf_16_endian);
4697       CODING_UTF_16_ENDIAN (coding) = (NILP (val) ? utf_16_big_endian
4698                                        : utf_16_little_endian);
4699       CODING_UTF_16_SURROGATE (coding) = 0;
4700       coding->detector = detect_coding_utf_16;
4701       coding->decoder = decode_coding_utf_16;
4702       coding->encoder = encode_coding_utf_16;
4703       coding->common_flags
4704         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4705     }
4706   else if (EQ (coding_type, Qccl))
4707     {
4708       coding->detector = detect_coding_ccl;
4709       coding->decoder = decode_coding_ccl;
4710       coding->encoder = encode_coding_ccl;
4711       coding->common_flags
4712         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4713             | CODING_REQUIRE_FLUSHING_MASK);
4714     }
4715   else if (EQ (coding_type, Qemacs_mule))
4716     {
4717       coding->detector = detect_coding_emacs_mule;
4718       coding->decoder = decode_coding_emacs_mule;
4719       coding->encoder = encode_coding_emacs_mule;
4720       coding->common_flags
4721         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4722       if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4723           && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4724         {
4725           Lisp_Object tail, safe_charsets;
4726           int max_charset_id = 0;
4727
4728           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4729                tail = XCDR (tail))
4730             if (max_charset_id < XFASTINT (XCAR (tail)))
4731               max_charset_id = XFASTINT (XCAR (tail));
4732           safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4733                                         make_number (255));
4734           for (tail = Vemacs_mule_charset_list; CONSP (tail);
4735                tail = XCDR (tail))
4736             XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4737           coding->max_charset_id = max_charset_id;
4738           coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4739         }
4740     }
4741   else if (EQ (coding_type, Qshift_jis))
4742     {
4743       coding->detector = detect_coding_sjis;
4744       coding->decoder = decode_coding_sjis;
4745       coding->encoder = encode_coding_sjis;
4746       coding->common_flags
4747         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4748     }
4749   else if (EQ (coding_type, Qbig5))
4750     {
4751       coding->detector = detect_coding_big5;
4752       coding->decoder = decode_coding_big5;
4753       coding->encoder = encode_coding_big5;
4754       coding->common_flags
4755         |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4756     }
4757   else                          /* EQ (coding_type, Qraw_text) */
4758     {
4759       coding->detector = NULL;
4760       coding->decoder = decode_coding_raw_text;
4761       coding->encoder = encode_coding_raw_text;
4762       coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4763     }
4764
4765   return;
4766 }
4767
4768 /* Return raw-text or one of its subsidiaries that has the same
4769    eol_type as CODING-SYSTEM.  */
4770
4771 Lisp_Object
4772 raw_text_coding_system (coding_system)
4773      Lisp_Object coding_system;
4774 {
4775   Lisp_Object spec, attrs;
4776   Lisp_Object eol_type, raw_text_eol_type;
4777
4778   spec = CODING_SYSTEM_SPEC (coding_system);
4779   attrs = AREF (spec, 0);
4780
4781   if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4782     return coding_system;
4783
4784   eol_type = AREF (spec, 2);
4785   if (VECTORP (eol_type))
4786     return Qraw_text;
4787   spec = CODING_SYSTEM_SPEC (Qraw_text);
4788   raw_text_eol_type = AREF (spec, 2);
4789   return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
4790           : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
4791           : AREF (raw_text_eol_type, 2));
4792 }
4793
4794
4795 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
4796    does, return one of the subsidiary that has the same eol-spec as
4797    PARENT.  Otherwise, return CODING_SYSTEM.  */
4798
4799 Lisp_Object
4800 coding_inherit_eol_type (coding_system, parent)
4801      Lisp_Object coding_system, parent;
4802 {
4803   Lisp_Object spec, attrs, eol_type;
4804
4805   spec = CODING_SYSTEM_SPEC (coding_system);
4806   attrs = AREF (spec, 0);
4807   eol_type = AREF (spec, 2);
4808   if (VECTORP (eol_type))
4809     {
4810       Lisp_Object parent_spec;
4811       Lisp_Object parent_eol_type;
4812
4813       parent_spec
4814         = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
4815       parent_eol_type = AREF (parent_spec, 2);
4816       if (EQ (parent_eol_type, Qunix))
4817         coding_system = AREF (eol_type, 0);
4818       else if (EQ (parent_eol_type, Qdos))
4819         coding_system = AREF (eol_type, 1);
4820       else if (EQ (parent_eol_type, Qmac))
4821         coding_system = AREF (eol_type, 2);
4822     }
4823   return coding_system;
4824 }
4825
4826 /* Emacs has a mechanism to automatically detect a coding system if it
4827    is one of Emacs' internal format, ISO2022, SJIS, and BIG5.  But,
4828    it's impossible to distinguish some coding systems accurately
4829    because they use the same range of codes.  So, at first, coding
4830    systems are categorized into 7, those are:
4831
4832    o coding-category-emacs-mule
4833
4834         The category for a coding system which has the same code range
4835         as Emacs' internal format.  Assigned the coding-system (Lisp
4836         symbol) `emacs-mule' by default.
4837
4838    o coding-category-sjis
4839
4840         The category for a coding system which has the same code range
4841         as SJIS.  Assigned the coding-system (Lisp
4842         symbol) `japanese-shift-jis' by default.
4843
4844    o coding-category-iso-7
4845
4846         The category for a coding system which has the same code range
4847         as ISO2022 of 7-bit environment.  This doesn't use any locking
4848         shift and single shift functions.  This can encode/decode all
4849         charsets.  Assigned the coding-system (Lisp symbol)
4850         `iso-2022-7bit' by default.
4851
4852    o coding-category-iso-7-tight
4853
4854         Same as coding-category-iso-7 except that this can
4855         encode/decode only the specified charsets.
4856
4857    o coding-category-iso-8-1
4858
4859         The category for a coding system which has the same code range
4860         as ISO2022 of 8-bit environment and graphic plane 1 used only
4861         for DIMENSION1 charset.  This doesn't use any locking shift
4862         and single shift functions.  Assigned the coding-system (Lisp
4863         symbol) `iso-latin-1' by default.
4864
4865    o coding-category-iso-8-2
4866
4867         The category for a coding system which has the same code range
4868         as ISO2022 of 8-bit environment and graphic plane 1 used only
4869         for DIMENSION2 charset.  This doesn't use any locking shift
4870         and single shift functions.  Assigned the coding-system (Lisp
4871         symbol) `japanese-iso-8bit' by default.
4872
4873    o coding-category-iso-7-else
4874
4875         The category for a coding system which has the same code range
4876         as ISO2022 of 7-bit environemnt but uses locking shift or
4877         single shift functions.  Assigned the coding-system (Lisp
4878         symbol) `iso-2022-7bit-lock' by default.
4879
4880    o coding-category-iso-8-else
4881
4882         The category for a coding system which has the same code range
4883         as ISO2022 of 8-bit environemnt but uses locking shift or
4884         single shift functions.  Assigned the coding-system (Lisp
4885         symbol) `iso-2022-8bit-ss2' by default.
4886
4887    o coding-category-big5
4888
4889         The category for a coding system which has the same code range
4890         as BIG5.  Assigned the coding-system (Lisp symbol)
4891         `cn-big5' by default.
4892
4893    o coding-category-utf-8
4894
4895         The category for a coding system which has the same code range
4896         as UTF-8 (cf. RFC2279).  Assigned the coding-system (Lisp
4897         symbol) `utf-8' by default.
4898
4899    o coding-category-utf-16-be
4900
4901         The category for a coding system in which a text has an
4902         Unicode signature (cf. Unicode Standard) in the order of BIG
4903         endian at the head.  Assigned the coding-system (Lisp symbol)
4904         `utf-16-be' by default.
4905
4906    o coding-category-utf-16-le
4907
4908         The category for a coding system in which a text has an
4909         Unicode signature (cf. Unicode Standard) in the order of
4910         LITTLE endian at the head.  Assigned the coding-system (Lisp
4911         symbol) `utf-16-le' by default.
4912
4913    o coding-category-ccl
4914
4915         The category for a coding system of which encoder/decoder is
4916         written in CCL programs.  The default value is nil, i.e., no
4917         coding system is assigned.
4918
4919    o coding-category-binary
4920
4921         The category for a coding system not categorized in any of the
4922         above.  Assigned the coding-system (Lisp symbol)
4923         `no-conversion' by default.
4924
4925    Each of them is a Lisp symbol and the value is an actual
4926    `coding-system's (this is also a Lisp symbol) assigned by a user.
4927    What Emacs does actually is to detect a category of coding system.
4928    Then, it uses a `coding-system' assigned to it.  If Emacs can't
4929    decide only one possible category, it selects a category of the
4930    highest priority.  Priorities of categories are also specified by a
4931    user in a Lisp variable `coding-category-list'.
4932
4933 */
4934
4935 #define EOL_SEEN_NONE   0
4936 #define EOL_SEEN_LF     1
4937 #define EOL_SEEN_CR     2
4938 #define EOL_SEEN_CRLF   4
4939
4940 /* Detect how end-of-line of a text of length CODING->src_bytes
4941    pointed by CODING->source is encoded.  Return one of
4942    EOL_SEEN_XXX.  */
4943
4944 #define MAX_EOL_CHECK_COUNT 3
4945
4946 static int
4947 detect_eol (coding, source, src_bytes)
4948      struct coding_system *coding;
4949      unsigned char *source;
4950      EMACS_INT src_bytes;
4951 {
4952   Lisp_Object attrs, coding_type;
4953   unsigned char *src = source, *src_end = src + src_bytes;
4954   unsigned char c;
4955   int total  = 0;
4956   int eol_seen = EOL_SEEN_NONE;
4957
4958   attrs = CODING_ID_ATTRS (coding->id);
4959   coding_type = CODING_ATTR_TYPE (attrs);
4960
4961   if (EQ (coding_type, Qccl))
4962     {
4963       int msb, lsb;
4964
4965       msb = coding->spec.utf_16.endian == utf_16_little_endian;
4966       lsb = 1 - msb;
4967
4968       while (src + 1 < src_end)
4969         {
4970           c = src[lsb];
4971           if (src[msb] == 0 && (c == '\n' || c == '\r'))
4972             {
4973               int this_eol;
4974
4975               if (c == '\n')
4976                 this_eol = EOL_SEEN_LF;
4977               else if (src + 3 >= src_end
4978                        || src[msb + 2] != 0
4979                        || src[lsb + 2] != '\n')
4980                 this_eol = EOL_SEEN_CR;
4981               else
4982                 this_eol = EOL_SEEN_CRLF;
4983
4984               if (eol_seen == EOL_SEEN_NONE)
4985                 /* This is the first end-of-line.  */
4986                 eol_seen = this_eol;
4987               else if (eol_seen != this_eol)
4988                 {
4989                   /* The found type is different from what found before.  */
4990                   eol_seen = EOL_SEEN_LF;
4991                   break;
4992                 }
4993               if (++total == MAX_EOL_CHECK_COUNT)
4994                 break;
4995             }
4996           src += 2;
4997         }
4998     }
4999   else
5000     {
5001       while (src < src_end)
5002         {
5003           c = *src++;
5004           if (c == '\n' || c == '\r')
5005             {
5006               int this_eol;
5007
5008               if (c == '\n')
5009                 this_eol = EOL_SEEN_LF;
5010               else if (src >= src_end || *src != '\n')
5011                 this_eol = EOL_SEEN_CR;
5012               else
5013                 this_eol = EOL_SEEN_CRLF, src++;
5014
5015               if (eol_seen == EOL_SEEN_NONE)
5016                 /* This is the first end-of-line.  */
5017                 eol_seen = this_eol;
5018               else if (eol_seen != this_eol)
5019                 {
5020                   /* The found type is different from what found before.  */
5021                   eol_seen = EOL_SEEN_LF;
5022                   break;
5023                 }
5024               if (++total == MAX_EOL_CHECK_COUNT)
5025                 break;
5026             }
5027         }
5028     }
5029   return eol_seen;
5030 }
5031
5032
5033 static void
5034 adjust_coding_eol_type (coding, eol_seen)
5035      struct coding_system *coding;
5036      int eol_seen;
5037 {
5038   Lisp_Object eol_type;
5039
5040   eol_type = CODING_ID_EOL_TYPE (coding->id);
5041   if (eol_seen & EOL_SEEN_LF)
5042     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5043   else if (eol_type & EOL_SEEN_CRLF)
5044     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5045   else if (eol_type & EOL_SEEN_CR)
5046     coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5047 }
5048
5049 /* Detect how a text specified in CODING is encoded.  If a coding
5050    system is detected, update fields of CODING by the detected coding
5051    system.  */
5052
5053 void
5054 detect_coding (coding)
5055      struct coding_system *coding;
5056 {
5057   unsigned char *src, *src_end;
5058   Lisp_Object attrs, coding_type;
5059
5060   coding->consumed = coding->consumed_char = 0;
5061   coding->produced = coding->produced_char = 0;
5062   coding_set_source (coding);
5063
5064   src_end = coding->source + coding->src_bytes;
5065
5066   /* If we have not yet decided the text encoding type, detect it
5067      now.  */
5068   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5069     {
5070       int mask = CATEGORY_MASK_ANY;
5071       int c, i;
5072
5073       for (src = coding->source; src < src_end; src++)
5074         {
5075           c = *src;
5076           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5077                                         || c == ISO_CODE_SI
5078                                         || c == ISO_CODE_SO)))
5079             break;
5080         }
5081       coding->head_ascii = src - (coding->source + coding->consumed);
5082
5083       if (coding->head_ascii < coding->src_bytes)
5084         {
5085           int detected = 0;
5086
5087           for (i = 0; i < coding_category_raw_text; i++)
5088             {
5089               enum coding_category category = coding_priorities[i];
5090               struct coding_system *this = coding_categories + category;
5091
5092               if (category >= coding_category_raw_text
5093                   || detected & (1 << category))
5094                 continue;
5095
5096               if (this->id < 0)
5097                 {
5098                   /* No coding system of this category is defined.  */
5099                   mask &= ~(1 << category);
5100                 }
5101               else
5102                 {
5103                   detected |= detected_mask[category];
5104                   if ((*(this->detector)) (coding, &mask))
5105                     break;
5106                 }
5107             }
5108           if (! mask)
5109             setup_coding_system (Qraw_text, coding);
5110           else if (mask != CATEGORY_MASK_ANY)
5111             for (i = 0; i < coding_category_raw_text; i++)
5112               {
5113                 enum coding_category category = coding_priorities[i];
5114                 struct coding_system *this = coding_categories + category;
5115
5116                 if (mask & (1 << category))
5117                   {
5118                     setup_coding_system (CODING_ID_NAME (this->id), coding);
5119                     break;
5120                   }
5121               }
5122         }
5123     }
5124
5125   attrs = CODING_ID_ATTRS (coding->id);
5126   coding_type = CODING_ATTR_TYPE (attrs);
5127
5128   /* If we have not yet decided the EOL type, detect it now.  But, the
5129      detection is impossible for a CCL based coding system, in which
5130      case, we detct the EOL type after decoding.  */
5131   if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5132       && ! EQ (coding_type, Qccl))
5133     {
5134       int eol_seen = detect_eol (coding, coding->source, coding->src_bytes);
5135
5136       if (eol_seen != EOL_SEEN_NONE)
5137         adjust_coding_eol_type (coding, eol_seen);
5138     }
5139 }
5140
5141
5142 static void
5143 decode_eol (coding)
5144      struct coding_system *coding;
5145 {
5146   if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5147     {
5148       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5149       unsigned char *pend = p + coding->produced;
5150       int eol_seen = EOL_SEEN_NONE;
5151
5152       for (; p < pend; p++)
5153         {
5154           if (*p == '\n')
5155             eol_seen |= EOL_SEEN_LF;
5156           else if (*p == '\r')
5157             {
5158               if (p + 1 < pend && *(p + 1) == '\n')
5159                 {
5160                   eol_seen |= EOL_SEEN_CRLF;
5161                   p++;
5162                 }
5163               else
5164                 eol_seen |= EOL_SEEN_CR;
5165             }
5166         }
5167       if (eol_seen != EOL_SEEN_NONE)
5168         adjust_coding_eol_type (coding, eol_seen);
5169     }
5170
5171   if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5172     {
5173       unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5174       unsigned char *pend = p + coding->produced;
5175
5176       for (; p < pend; p++)
5177         if (*p == '\r')
5178           *p = '\n';
5179     }
5180   else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5181     {
5182       unsigned char *p, *pbeg, *pend;
5183       Lisp_Object undo_list;
5184
5185       move_gap_both (coding->dst_pos + coding->produced_char,
5186                      coding->dst_pos_byte + coding->produced);
5187       undo_list = current_buffer->undo_list;
5188       current_buffer->undo_list = Qt;
5189       del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, Qnil);
5190       current_buffer->undo_list = undo_list;
5191       pbeg = GPT_ADDR;
5192       pend = pbeg + coding->produced;
5193
5194       for (p = pend - 1; p >= pbeg; p--)
5195         if (*p == '\r')
5196           {
5197             safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5198             pend--;
5199           }
5200       coding->produced_char -= coding->produced - (pend - pbeg);
5201       coding->produced = pend - pbeg;
5202       insert_from_gap (coding->produced_char, coding->produced);
5203     }
5204 }
5205
5206 static void
5207 translate_chars (coding, table)
5208      struct coding_system *coding;
5209      Lisp_Object table;
5210 {
5211   int *charbuf = coding->charbuf;
5212   int *charbuf_end = charbuf + coding->charbuf_used;
5213   int c;
5214
5215   if (coding->chars_at_source)
5216     return;
5217
5218   while (charbuf < charbuf_end)
5219     {
5220       c = *charbuf;
5221       if (c < 0)
5222         charbuf += c;
5223       else
5224         *charbuf++ = translate_char (table, c);
5225     }
5226 }
5227
5228 static int
5229 produce_chars (coding)
5230      struct coding_system *coding;
5231 {
5232   unsigned char *dst = coding->destination + coding->produced;
5233   unsigned char *dst_end = coding->destination + coding->dst_bytes;
5234   int produced;
5235   int produced_chars = 0;
5236
5237   if (! coding->chars_at_source)
5238     {
5239       /* Characters are in coding->charbuf.  */
5240       int *buf = coding->charbuf;
5241       int *buf_end = buf + coding->charbuf_used;
5242       unsigned char *adjusted_dst_end;
5243
5244       if (BUFFERP (coding->src_object)
5245           && EQ (coding->src_object, coding->dst_object))
5246         dst_end = coding->source + coding->consumed;
5247       adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5248
5249       while (buf < buf_end)
5250         {
5251           int c = *buf++;
5252
5253           if (dst >= adjusted_dst_end)
5254             {
5255               dst = alloc_destination (coding,
5256                                        buf_end - buf + MAX_MULTIBYTE_LENGTH,
5257                                        dst);
5258               dst_end = coding->destination + coding->dst_bytes;
5259               adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5260             }
5261           if (c >= 0)
5262             {
5263               if (coding->dst_multibyte
5264                   || ! CHAR_BYTE8_P (c))
5265                 CHAR_STRING_ADVANCE (c, dst);
5266               else
5267                 *dst++ = CHAR_TO_BYTE8 (c);
5268               produced_chars++;
5269             }
5270           else
5271             /* This is an annotation data.  */
5272             buf -= c + 1;
5273         }
5274     }
5275   else
5276     {
5277       unsigned char *src = coding->source;
5278       unsigned char *src_end = src + coding->src_bytes;
5279       Lisp_Object eol_type;
5280
5281       eol_type = CODING_ID_EOL_TYPE (coding->id);
5282
5283       if (coding->src_multibyte != coding->dst_multibyte)
5284         {
5285           if (coding->src_multibyte)
5286             {
5287               int multibytep = 1;
5288               int consumed_chars;
5289
5290               while (1)
5291                 {
5292                   unsigned char *src_base = src;
5293                   int c;
5294
5295                   ONE_MORE_BYTE (c);
5296                   if (c == '\r')
5297                     {
5298                       if (EQ (eol_type, Qdos))
5299                         {
5300                           if (src < src_end
5301                               && *src == '\n')
5302                             c = *src++;
5303                         }
5304                       else if (EQ (eol_type, Qmac))
5305                         c = '\n';
5306                     }
5307                   if (dst == dst_end)
5308                     {
5309                       coding->consumed = src - coding->source;
5310
5311                     if (EQ (coding->src_object, coding->dst_object))
5312                       dst_end = src;
5313                     if (dst == dst_end)
5314                       {
5315                         dst = alloc_destination (coding, src_end - src + 1,
5316                                                  dst);
5317                         dst_end = coding->destination + coding->dst_bytes;
5318                         coding_set_source (coding);
5319                         src = coding->source + coding->consumed;
5320                         src_end = coding->source + coding->src_bytes;
5321                       }
5322                     }
5323                   *dst++ = c;
5324                   produced_chars++;
5325                 }
5326             no_more_source:
5327               ;
5328             }
5329           else
5330             while (src < src_end)
5331               {
5332                 int multibytep = 1;
5333                 int c = *src++;
5334
5335                 if (c == '\r')
5336                   {
5337                     if (EQ (eol_type, Qdos))
5338                       {
5339                         if (src < src_end
5340                             && *src == '\n')
5341                           c = *src++;
5342                       }
5343                     else if (EQ (eol_type, Qmac))
5344                       c = '\n';
5345                   }
5346                 if (dst >= dst_end - 1)
5347                   {
5348                     coding->consumed = src - coding->source;
5349
5350                     if (EQ (coding->src_object, coding->dst_object))
5351                       dst_end = src;
5352                     if (dst >= dst_end - 1)
5353                       {
5354                         dst = alloc_destination (coding, src_end - src + 2,
5355                                                  dst);
5356                         dst_end = coding->destination + coding->dst_bytes;
5357                         coding_set_source (coding);
5358                         src = coding->source + coding->consumed;
5359                         src_end = coding->source + coding->src_bytes;
5360                       }
5361                   }
5362                 EMIT_ONE_BYTE (c);
5363               }
5364         }
5365       else
5366         {
5367           if (!EQ (coding->src_object, coding->dst_object))
5368             {
5369               int require = coding->src_bytes - coding->dst_bytes;
5370
5371               if (require > 0)
5372                 {
5373                   EMACS_INT offset = src - coding->source;
5374
5375                   dst = alloc_destination (coding, require, dst);
5376                   coding_set_source (coding);
5377                   src = coding->source + offset;
5378                   src_end = coding->source + coding->src_bytes;
5379                 }
5380             }
5381           produced_chars = coding->src_chars;
5382           while (src < src_end)
5383             {
5384               int c = *src++;
5385
5386               if (c == '\r')
5387                 {
5388                   if (EQ (eol_type, Qdos))
5389                     {
5390                       if (src < src_end
5391                           && *src == '\n')
5392                         c = *src++;
5393                       produced_chars--;
5394                     }
5395                   else if (EQ (eol_type, Qmac))
5396                     c = '\n';
5397                 }
5398               *dst++ = c;
5399             }
5400         }
5401       coding->consumed = coding->src_bytes;
5402       coding->consumed_char = coding->src_chars;
5403     }
5404
5405   produced = dst - (coding->destination + coding->produced);
5406   if (BUFFERP (coding->dst_object))
5407     insert_from_gap (produced_chars, produced);
5408   coding->produced += produced;
5409   coding->produced_char += produced_chars;
5410   return produced_chars;
5411 }
5412
5413 /* [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN ]
5414         or
5415    [ -LENGTH CHAR_POS_OFFSET MASK METHOD COMP_LEN COMPONENTS... ]
5416  */
5417
5418 static INLINE void
5419 produce_composition (coding, charbuf)
5420      struct coding_system *coding;
5421      int *charbuf;
5422 {
5423   Lisp_Object buffer;
5424   int len;
5425   EMACS_INT pos;
5426   enum composition_method method;
5427   int cmp_len;
5428   Lisp_Object components;
5429
5430   buffer = coding->dst_object;
5431   len = -charbuf[0];
5432   pos = coding->dst_pos + charbuf[1];
5433   method = (enum composition_method) (charbuf[3]);
5434   cmp_len = charbuf[4];
5435
5436   if (method == COMPOSITION_RELATIVE)
5437     components = Qnil;
5438   else
5439     {
5440       Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5441       int i;
5442
5443       len -= 5;
5444       charbuf += 5;
5445       for (i = 0; i < len; i++)
5446         args[i] = make_number (charbuf[i]);
5447       components = (method == COMPOSITION_WITH_ALTCHARS
5448                     ? Fstring (len, args) : Fvector (len, args));
5449     }
5450   compose_text (pos, pos + cmp_len, components, Qnil, Qnil);
5451 }
5452
5453 static int *
5454 save_composition_data (buf, buf_end, prop)
5455      int *buf, *buf_end;
5456      Lisp_Object prop;
5457 {
5458   enum composition_method method = COMPOSITION_METHOD (prop);
5459   int cmp_len = COMPOSITION_LENGTH (prop);
5460
5461   if (buf + 4 + (MAX_COMPOSITION_COMPONENTS * 2 - 1) > buf_end)
5462     return NULL;
5463
5464   buf[1] = CODING_ANNOTATE_COMPOSITION_MASK;
5465   buf[2] = method;
5466   buf[3] = cmp_len;
5467
5468   if (method == COMPOSITION_RELATIVE)
5469     buf[0] = 4;
5470   else
5471     {
5472       Lisp_Object components;
5473       int len, i;
5474
5475       components = COMPOSITION_COMPONENTS (prop);
5476       if (VECTORP (components))
5477         {
5478           len = XVECTOR (components)->size;
5479           for (i = 0; i < len; i++)
5480             buf[4 + i] = XINT (AREF (components, i));
5481         }
5482       else if (STRINGP (components))
5483         {
5484           int i_byte;
5485
5486           len = XSTRING (components)->size;
5487           i = i_byte = 0;
5488           while (i < len)
5489             FETCH_STRING_CHAR_ADVANCE (buf[4 + i], components, i, i_byte);
5490         }
5491       else if (INTEGERP (components))
5492         {
5493           len = 1;
5494           buf[4] = XINT (components);
5495         }
5496       else if (CONSP (components))
5497         {
5498           for (len = 0; CONSP (components);
5499                len++, components = XCDR (components))
5500             buf[4 + len] = XINT (XCAR (components));
5501         }
5502       else
5503         abort ();
5504       buf[0] = 4 + len;
5505     }
5506   return (buf + buf[0]);
5507 }
5508
5509 #define CHARBUF_SIZE 0x4000
5510
5511 #define ALLOC_CONVERSION_WORK_AREA(coding)                              \
5512   do {                                                                  \
5513     int size = CHARBUF_SIZE;;                                           \
5514                                                                         \
5515     coding->charbuf = NULL;                                             \
5516     while (size > 1024)                                                 \
5517       {                                                                 \
5518         coding->charbuf = (int *) alloca (sizeof (int) * size);         \
5519         if (coding->charbuf)                                            \
5520           break;                                                        \
5521         size >>= 1;                                                     \
5522       }                                                                 \
5523     if (! coding->charbuf)                                              \
5524       {                                                                 \
5525         coding->result = CODING_RESULT_INSUFFICIENT_MEM;                \
5526         return coding->result;                                          \
5527       }                                                                 \
5528     coding->charbuf_size = size;                                        \
5529   } while (0)
5530
5531
5532 static void
5533 produce_annotation (coding)
5534      struct coding_system *coding;
5535 {
5536   int *charbuf = coding->charbuf;
5537   int *charbuf_end = charbuf + coding->charbuf_used;
5538
5539   while (charbuf < charbuf_end)
5540     {
5541       if (*charbuf >= 0)
5542         charbuf++;
5543       else
5544         {
5545           int len = -*charbuf;
5546           switch (charbuf[2])
5547             {
5548             case CODING_ANNOTATE_COMPOSITION_MASK:
5549               produce_composition (coding, charbuf);
5550               break;
5551             default:
5552               abort ();
5553             }
5554           charbuf += len;
5555         }
5556     }
5557 }
5558
5559 /* Decode the data at CODING->src_object into CODING->dst_object.
5560    CODING->src_object is a buffer, a string, or nil.
5561    CODING->dst_object is a buffer.
5562
5563    If CODING->src_object is a buffer, it must be the current buffer.
5564    In this case, if CODING->src_pos is positive, it is a position of
5565    the source text in the buffer, otherwise, the source text is in the
5566    gap area of the buffer, and CODING->src_pos specifies the offset of
5567    the text from GPT (which must be the same as PT).  If this is the
5568    same buffer as CODING->dst_object, CODING->src_pos must be
5569    negative.
5570
5571    If CODING->src_object is a string, CODING->src_pos in an index to
5572    that string.
5573
5574    If CODING->src_object is nil, CODING->source must already point to
5575    the non-relocatable memory area.  In this case, CODING->src_pos is
5576    an offset from CODING->source.
5577
5578    The decoded data is inserted at the current point of the buffer
5579    CODING->dst_object.
5580 */
5581
5582 static int
5583 decode_coding (coding)
5584      struct coding_system *coding;
5585 {
5586   Lisp_Object attrs;
5587
5588   if (BUFFERP (coding->src_object)
5589       && coding->src_pos > 0
5590       && coding->src_pos < GPT
5591       && coding->src_pos + coding->src_chars > GPT)
5592     move_gap_both (coding->src_pos, coding->src_pos_byte);
5593
5594   if (BUFFERP (coding->dst_object))
5595     {
5596       if (current_buffer != XBUFFER (coding->dst_object))
5597         set_buffer_internal (XBUFFER (coding->dst_object));
5598       if (GPT != PT)
5599         move_gap_both (PT, PT_BYTE);
5600     }
5601
5602   coding->consumed = coding->consumed_char = 0;
5603   coding->produced = coding->produced_char = 0;
5604   coding->chars_at_source = 0;
5605   coding->result = CODING_RESULT_SUCCESS;
5606   coding->errors = 0;
5607
5608   ALLOC_CONVERSION_WORK_AREA (coding);
5609
5610   attrs = CODING_ID_ATTRS (coding->id);
5611
5612   do
5613     {
5614       coding_set_source (coding);
5615       coding->annotated = 0;
5616       (*(coding->decoder)) (coding);
5617       if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5618         translate_chars (CODING_ATTR_DECODE_TBL (attrs), coding);
5619       coding_set_destination (coding);
5620       produce_chars (coding);
5621       if (coding->annotated)
5622         produce_annotation (coding);
5623     }
5624   while (coding->consumed < coding->src_bytes
5625          && ! coding->result);
5626
5627   if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5628       && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5629       && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5630     decode_eol (coding);
5631
5632   coding->carryover_bytes = 0;
5633   if (coding->consumed < coding->src_bytes)
5634     {
5635       int nbytes = coding->src_bytes - coding->consumed;
5636       unsigned char *src;
5637
5638       coding_set_source (coding);
5639       coding_set_destination (coding);
5640       src = coding->source + coding->consumed;
5641
5642       if (coding->mode & CODING_MODE_LAST_BLOCK)
5643         {
5644           /* Flush out unprocessed data as binary chars.  We are sure
5645              that the number of data is less than the size of
5646              coding->charbuf.  */
5647           int *charbuf = coding->charbuf;
5648
5649           while (nbytes-- > 0)
5650             {
5651               int c = *src++;
5652               *charbuf++ =  (c & 0x80 ? - c : c);
5653             }
5654           produce_chars (coding);
5655         }
5656       else
5657         {
5658           /* Record unprocessed bytes in coding->carryover.  We are
5659              sure that the number of data is less than the size of
5660              coding->carryover.  */
5661           unsigned char *p = coding->carryover;
5662
5663           coding->carryover_bytes = nbytes;
5664           while (nbytes-- > 0)
5665             *p++ = *src++;
5666         }
5667       coding->consumed = coding->src_bytes;
5668     }
5669
5670   return coding->result;
5671 }
5672
5673 static void
5674 consume_chars (coding)
5675      struct coding_system *coding;
5676 {
5677   int *buf = coding->charbuf;
5678   /* -1 is to compensate for CRLF.  */
5679   int *buf_end = coding->charbuf + coding->charbuf_size - 1;
5680   unsigned char *src = coding->source + coding->consumed;
5681   int pos = coding->src_pos + coding->consumed_char;
5682   int end_pos = coding->src_pos + coding->src_chars;
5683   int multibytep = coding->src_multibyte;
5684   Lisp_Object eol_type;
5685   int c;
5686   int start, end, stop;
5687   Lisp_Object object, prop;
5688
5689   eol_type = CODING_ID_EOL_TYPE (coding->id);
5690   if (VECTORP (eol_type))
5691     eol_type = Qunix;
5692
5693   object = coding->src_object;
5694
5695   /* Note: composition handling is not yet implemented.  */
5696   coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
5697
5698   if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK
5699       && find_composition (pos, end_pos, &start, &end, &prop, object)
5700       && end <= end_pos
5701       && (start >= pos
5702           || (find_composition (end, end_pos, &start, &end, &prop, object)
5703               && end <= end_pos)))
5704     stop = start;
5705   else
5706     stop = end_pos;
5707
5708   while (buf < buf_end)
5709     {
5710       if (pos == stop)
5711         {
5712           int *p;
5713
5714           if (pos == end_pos)
5715             break;
5716           p = save_composition_data (buf, buf_end, prop);
5717           if (p == NULL)
5718             break;
5719           buf = p;
5720           if (find_composition (end, end_pos, &start, &end, &prop, object)
5721               && end <= end_pos)
5722             stop = start;
5723           else
5724             stop = end_pos;
5725         }
5726
5727       if (! multibytep)
5728         c = *src++;
5729       else
5730         c = STRING_CHAR_ADVANCE (src);
5731       if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
5732         c = '\n';
5733       if (! EQ (eol_type, Qunix))
5734         {
5735           if (c == '\n')
5736             {
5737               if (EQ (eol_type, Qdos))
5738                 *buf++ = '\r';
5739               else
5740                 c = '\r';
5741             }
5742         }
5743       *buf++ = c;
5744       pos++;
5745     }
5746
5747   coding->consumed = src - coding->source;
5748   coding->consumed_char = pos - coding->src_pos;
5749   coding->charbuf_used = buf - coding->charbuf;
5750   coding->chars_at_source = 0;
5751 }
5752
5753
5754 /* Encode the text at CODING->src_object into CODING->dst_object.
5755    CODING->src_object is a buffer or a string.
5756    CODING->dst_object is a buffer or nil.
5757
5758    If CODING->src_object is a buffer, it must be the current buffer.
5759    In this case, if CODING->src_pos is positive, it is a position of
5760    the source text in the buffer, otherwise. the source text is in the
5761    gap area of the buffer, and coding->src_pos specifies the offset of
5762    the text from GPT (which must be the same as PT).  If this is the
5763    same buffer as CODING->dst_object, CODING->src_pos must be
5764    negative and CODING should not have `pre-write-conversion'.
5765
5766    If CODING->src_object is a string, CODING should not have
5767    `pre-write-conversion'.
5768
5769    If CODING->dst_object is a buffer, the encoded data is inserted at
5770    the current point of that buffer.
5771
5772    If CODING->dst_object is nil, the encoded data is placed at the
5773    memory area specified by CODING->destination.  */
5774
5775 static int
5776 encode_coding (coding)
5777      struct coding_system *coding;
5778 {
5779   Lisp_Object attrs;
5780
5781   attrs = CODING_ID_ATTRS (coding->id);
5782
5783   if (BUFFERP (coding->dst_object))
5784     {
5785       set_buffer_internal (XBUFFER (coding->dst_object));
5786       coding->dst_multibyte
5787         = ! NILP (current_buffer->enable_multibyte_characters);
5788     }
5789
5790   coding->consumed = coding->consumed_char = 0;
5791   coding->produced = coding->produced_char = 0;
5792   coding->result = CODING_RESULT_SUCCESS;
5793   coding->errors = 0;
5794
5795   ALLOC_CONVERSION_WORK_AREA (coding);
5796
5797   do {
5798     coding_set_source (coding);
5799     consume_chars (coding);
5800
5801     if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
5802       translate_chars (CODING_ATTR_ENCODE_TBL (attrs), coding);
5803
5804     coding_set_destination (coding);
5805     (*(coding->encoder)) (coding);
5806   } while (coding->consumed_char < coding->src_chars);
5807
5808   if (BUFFERP (coding->dst_object))
5809     insert_from_gap (coding->produced_char, coding->produced);
5810
5811   return (coding->result);
5812 }
5813
5814 /* Work buffer */
5815
5816 /* List of currently used working buffer.  */
5817 Lisp_Object Vcode_conversion_work_buf_list;
5818
5819 /* A working buffer used by the top level conversion.  */
5820 Lisp_Object Vcode_conversion_reused_work_buf;
5821
5822
5823 /* Return a working buffer that can be freely used by the following
5824    code conversion.  MULTIBYTEP specifies the multibyteness of the
5825    buffer.  */
5826
5827 Lisp_Object
5828 make_conversion_work_buffer (multibytep)
5829      int multibytep;
5830 {
5831   struct buffer *current = current_buffer;
5832   Lisp_Object buf;
5833
5834   if (NILP (Vcode_conversion_work_buf_list))
5835     {
5836       if (NILP (Vcode_conversion_reused_work_buf))
5837         Vcode_conversion_reused_work_buf
5838           = Fget_buffer_create (build_string (" *code-conversion-work*"));
5839       Vcode_conversion_work_buf_list
5840         = Fcons (Vcode_conversion_reused_work_buf, Qnil);
5841     }
5842   else
5843     {
5844       int depth = Flength (Vcode_conversion_work_buf_list);
5845       char str[128];
5846
5847       sprintf (str, " *code-conversion-work*<%d>", depth);
5848       Vcode_conversion_work_buf_list
5849         = Fcons (Fget_buffer_create (build_string (str)),
5850                  Vcode_conversion_work_buf_list);
5851     }
5852
5853   buf = XCAR (Vcode_conversion_work_buf_list);
5854   set_buffer_internal (XBUFFER (buf));
5855   current_buffer->undo_list = Qt;
5856   Ferase_buffer ();
5857   Fset_buffer_multibyte (multibytep ? Qt : Qnil);
5858   set_buffer_internal (current);
5859   return buf;
5860 }
5861
5862 static struct coding_system *saved_coding;
5863
5864 Lisp_Object
5865 code_conversion_restore (info)
5866      Lisp_Object info;
5867 {
5868   int depth = Flength (Vcode_conversion_work_buf_list);
5869   Lisp_Object buf;
5870
5871   if (depth > 0)
5872     {
5873       buf = XCAR (Vcode_conversion_work_buf_list);
5874       Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
5875       if (depth > 1 && !NILP (Fbuffer_live_p (buf)))
5876         Fkill_buffer (buf);
5877     }
5878
5879   if (saved_coding->dst_object == Qt
5880       && saved_coding->destination)
5881     xfree (saved_coding->destination);
5882
5883   return save_excursion_restore (info);
5884 }
5885
5886
5887 int
5888 decode_coding_gap (coding, chars, bytes)
5889      struct coding_system *coding;
5890      EMACS_INT chars, bytes;
5891 {
5892   int count = specpdl_ptr - specpdl;
5893
5894   saved_coding = coding;
5895   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5896
5897   coding->src_object = Fcurrent_buffer ();
5898   coding->src_chars = chars;
5899   coding->src_bytes = bytes;
5900   coding->src_pos = -chars;
5901   coding->src_pos_byte = -bytes;
5902   coding->src_multibyte = chars < bytes;
5903   coding->dst_object = coding->src_object;
5904   coding->dst_pos = PT;
5905   coding->dst_pos_byte = PT_BYTE;
5906   coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
5907
5908   if (CODING_REQUIRE_DETECTION (coding))
5909     detect_coding (coding);
5910
5911   decode_coding (coding);
5912
5913   unbind_to (count, Qnil);
5914   return coding->result;
5915 }
5916
5917 int
5918 encode_coding_gap (coding, chars, bytes)
5919      struct coding_system *coding;
5920      EMACS_INT chars, bytes;
5921 {
5922   int count = specpdl_ptr - specpdl;
5923   Lisp_Object buffer;
5924
5925   saved_coding = coding;
5926   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5927
5928   buffer = Fcurrent_buffer ();
5929   coding->src_object = buffer;
5930   coding->src_chars = chars;
5931   coding->src_bytes = bytes;
5932   coding->src_pos = -chars;
5933   coding->src_pos_byte = -bytes;
5934   coding->src_multibyte = chars < bytes;
5935   coding->dst_object = coding->src_object;
5936   coding->dst_pos = PT;
5937   coding->dst_pos_byte = PT_BYTE;
5938
5939   encode_coding (coding);
5940
5941   unbind_to (count, Qnil);
5942   return coding->result;
5943 }
5944
5945
5946 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
5947    SRC_OBJECT into DST_OBJECT by coding context CODING.
5948
5949    SRC_OBJECT is a buffer, a string, or Qnil.
5950
5951    If it is a buffer, the text is at point of the buffer.  FROM and TO
5952    are positions in the buffer.
5953
5954    If it is a string, the text is at the beginning of the string.
5955    FROM and TO are indices to the string.
5956
5957    If it is nil, the text is at coding->source.  FROM and TO are
5958    indices to coding->source.
5959
5960    DST_OBJECT is a buffer, Qt, or Qnil.
5961
5962    If it is a buffer, the decoded text is inserted at point of the
5963    buffer.  If the buffer is the same as SRC_OBJECT, the source text
5964    is deleted.
5965
5966    If it is Qt, a string is made from the decoded text, and
5967    set in CODING->dst_object.
5968
5969    If it is Qnil, the decoded text is stored at CODING->destination.
5970    The called must allocate CODING->dst_bytes bytes at
5971    CODING->destination by xmalloc.  If the decoded text is longer than
5972    CODING->dst_bytes, CODING->destination is relocated by xrealloc.
5973  */
5974
5975 void
5976 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
5977                       dst_object)
5978      struct coding_system *coding;
5979      Lisp_Object src_object;
5980      EMACS_INT from, from_byte, to, to_byte;
5981      Lisp_Object dst_object;
5982 {
5983   int count = specpdl_ptr - specpdl;
5984   unsigned char *destination;
5985   EMACS_INT dst_bytes;
5986   EMACS_INT chars = to - from;
5987   EMACS_INT bytes = to_byte - from_byte;
5988   Lisp_Object attrs;
5989
5990   saved_coding = coding;
5991   record_unwind_protect (code_conversion_restore, save_excursion_save ());
5992
5993   if (NILP (dst_object))
5994     {
5995       destination = coding->destination;
5996       dst_bytes = coding->dst_bytes;
5997     }
5998
5999   coding->src_object = src_object;
6000   coding->src_chars = chars;
6001   coding->src_bytes = bytes;
6002   coding->src_multibyte = chars < bytes;
6003
6004   if (STRINGP (src_object))
6005     {
6006       coding->src_pos = from;
6007       coding->src_pos_byte = from_byte;
6008     }
6009   else if (BUFFERP (src_object))
6010     {
6011       set_buffer_internal (XBUFFER (src_object));
6012       if (from != GPT)
6013         move_gap_both (from, from_byte);
6014       if (EQ (src_object, dst_object))
6015         {
6016           TEMP_SET_PT_BOTH (from, from_byte);
6017           del_range_both (from, from_byte, to, to_byte, 1);
6018           coding->src_pos = -chars;
6019           coding->src_pos_byte = -bytes;
6020         }
6021       else
6022         {
6023           coding->src_pos = from;
6024           coding->src_pos_byte = from_byte;
6025         }
6026     }
6027
6028   if (CODING_REQUIRE_DETECTION (coding))
6029     detect_coding (coding);
6030   attrs = CODING_ID_ATTRS (coding->id);
6031
6032   if (! NILP (CODING_ATTR_POST_READ (attrs))
6033       || EQ (dst_object, Qt))
6034     {
6035       coding->dst_object = make_conversion_work_buffer (1);
6036       coding->dst_pos = BEG;
6037       coding->dst_pos_byte = BEG_BYTE;
6038       coding->dst_multibyte = 1;
6039     }
6040   else if (BUFFERP (dst_object))
6041     {
6042       coding->dst_object = dst_object;
6043       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6044       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6045       coding->dst_multibyte
6046         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6047     }
6048   else
6049     {
6050       coding->dst_object = Qnil;
6051       coding->dst_multibyte = 1;
6052     }
6053
6054   decode_coding (coding);
6055
6056   if (BUFFERP (coding->dst_object))
6057     set_buffer_internal (XBUFFER (coding->dst_object));
6058
6059   if (! NILP (CODING_ATTR_POST_READ (attrs)))
6060     {
6061       struct gcpro gcpro1, gcpro2;
6062       EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6063       Lisp_Object val;
6064
6065       TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6066       GCPRO2 (coding->src_object, coding->dst_object);
6067       val = call1 (CODING_ATTR_POST_READ (attrs),
6068                    make_number (coding->produced_char));
6069       UNGCPRO;
6070       CHECK_NATNUM (val);
6071       coding->produced_char += Z - prev_Z;
6072       coding->produced += Z_BYTE - prev_Z_BYTE;
6073     }
6074
6075   if (EQ (dst_object, Qt))
6076     {
6077       coding->dst_object = Fbuffer_string ();
6078     }
6079   else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6080     {
6081       set_buffer_internal (XBUFFER (coding->dst_object));
6082       if (dst_bytes < coding->produced)
6083         {
6084           destination
6085             = (unsigned char *) xrealloc (destination, coding->produced);
6086           if (! destination)
6087             {
6088               coding->result = CODING_RESULT_INSUFFICIENT_DST;
6089               unbind_to (count, Qnil);
6090               return;
6091             }
6092           if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6093             move_gap_both (BEGV, BEGV_BYTE);
6094           bcopy (BEGV_ADDR, destination, coding->produced);
6095           coding->destination = destination;
6096         }
6097     }
6098
6099   unbind_to (count, Qnil);
6100 }
6101
6102
6103 void
6104 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6105                       dst_object)
6106      struct coding_system *coding;
6107      Lisp_Object src_object;
6108      EMACS_INT from, from_byte, to, to_byte;
6109      Lisp_Object dst_object;
6110 {
6111   int count = specpdl_ptr - specpdl;
6112   EMACS_INT chars = to - from;
6113   EMACS_INT bytes = to_byte - from_byte;
6114   Lisp_Object attrs;
6115
6116   saved_coding = coding;
6117   record_unwind_protect (code_conversion_restore, save_excursion_save ());
6118
6119   coding->src_object = src_object;
6120   coding->src_chars = chars;
6121   coding->src_bytes = bytes;
6122   coding->src_multibyte = chars < bytes;
6123
6124   attrs = CODING_ID_ATTRS (coding->id);
6125
6126   if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6127     {
6128       coding->src_object = make_conversion_work_buffer (coding->src_multibyte);
6129       set_buffer_internal (XBUFFER (coding->src_object));
6130       if (STRINGP (src_object))
6131         insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6132       else if (BUFFERP (src_object))
6133         insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6134       else
6135         insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6136
6137       if (EQ (src_object, dst_object))
6138         {
6139           set_buffer_internal (XBUFFER (src_object));
6140           del_range_both (from, from_byte, to, to_byte, 1);
6141           set_buffer_internal (XBUFFER (coding->src_object));
6142         }
6143
6144       call2 (CODING_ATTR_PRE_WRITE (attrs),
6145              make_number (BEG), make_number (Z));
6146       coding->src_object = Fcurrent_buffer ();
6147       if (BEG != GPT)
6148         move_gap_both (BEG, BEG_BYTE);
6149       coding->src_chars = Z - BEG;
6150       coding->src_bytes = Z_BYTE - BEG_BYTE;
6151       coding->src_pos = BEG;
6152       coding->src_pos_byte = BEG_BYTE;
6153       coding->src_multibyte = Z < Z_BYTE;
6154     }
6155   else if (STRINGP (src_object))
6156     {
6157       coding->src_pos = from;
6158       coding->src_pos_byte = from_byte;
6159     }
6160   else if (BUFFERP (src_object))
6161     {
6162       set_buffer_internal (XBUFFER (src_object));
6163       if (from != GPT)
6164         move_gap_both (from, from_byte);
6165       if (EQ (src_object, dst_object))
6166         {
6167           del_range_both (from, from_byte, to, to_byte, 1);
6168           coding->src_pos = -chars;
6169           coding->src_pos_byte = -bytes;
6170         }
6171       else
6172         {
6173           coding->src_pos = from;
6174           coding->src_pos_byte = from_byte;
6175         }
6176     }
6177
6178   if (BUFFERP (dst_object))
6179     {
6180       coding->dst_object = dst_object;
6181       coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6182       coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6183       coding->dst_multibyte
6184         = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6185     }
6186   else if (EQ (dst_object, Qt))
6187     {
6188       coding->dst_object = Qnil;
6189       coding->dst_bytes = coding->src_chars;
6190       if (coding->dst_bytes == 0)
6191         coding->dst_bytes = 1;
6192       coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6193       coding->dst_multibyte = 0;
6194     }
6195   else
6196     {
6197       coding->dst_object = Qnil;
6198       coding->dst_multibyte = 0;
6199     }
6200
6201   encode_coding (coding);
6202
6203   if (EQ (dst_object, Qt))
6204     {
6205       if (BUFFERP (coding->dst_object))
6206         coding->dst_object = Fbuffer_string ();
6207       else
6208         {
6209           coding->dst_object
6210             = make_unibyte_string ((char *) coding->destination,
6211                                    coding->produced);
6212           xfree (coding->destination);
6213         }
6214     }
6215
6216   unbind_to (count, Qnil);
6217 }
6218
6219
6220 Lisp_Object
6221 preferred_coding_system ()
6222 {
6223   int id = coding_categories[coding_priorities[0]].id;
6224
6225   return CODING_ID_NAME (id);
6226 }
6227
6228 \f
6229 #ifdef emacs
6230 /*** 8. Emacs Lisp library functions ***/
6231
6232 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6233        doc: /* Return t if OBJECT is nil or a coding-system.
6234 See the documentation of `define-coding-system' for information
6235 about coding-system objects.  */)
6236      (obj)
6237      Lisp_Object obj;
6238 {
6239   return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6240 }
6241
6242 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6243        Sread_non_nil_coding_system, 1, 1, 0,
6244        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.  */)
6245      (prompt)
6246      Lisp_Object prompt;
6247 {
6248   Lisp_Object val;
6249   do
6250     {
6251       val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6252                               Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6253     }
6254   while (XSTRING (val)->size == 0);
6255   return (Fintern (val, Qnil));
6256 }
6257
6258 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6259        doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6260 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM.  */)
6261      (prompt, default_coding_system)
6262      Lisp_Object prompt, default_coding_system;
6263 {
6264   Lisp_Object val;
6265   if (SYMBOLP (default_coding_system))
6266     XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6267   val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6268                           Qt, Qnil, Qcoding_system_history,
6269                           default_coding_system, Qnil);
6270   return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6271 }
6272
6273 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6274        1, 1, 0,
6275        doc: /* Check validity of CODING-SYSTEM.
6276 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error.
6277 It is valid if it is a symbol with a non-nil `coding-system' property.
6278 The value of property should be a vector of length 5.  */)
6279   (coding_system)
6280      Lisp_Object coding_system;
6281 {
6282   CHECK_SYMBOL (coding_system);
6283   if (!NILP (Fcoding_system_p (coding_system)))
6284     return coding_system;
6285   while (1)
6286     Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6287 }
6288
6289 \f
6290 Lisp_Object
6291 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6292      unsigned char *src;
6293      int src_bytes, highest;
6294      int multibytep;
6295      Lisp_Object coding_system;
6296 {
6297   unsigned char *src_end = src + src_bytes;
6298   int mask = CATEGORY_MASK_ANY;
6299   int detected = 0;
6300   int c, i;
6301   Lisp_Object attrs, eol_type;
6302   Lisp_Object val;
6303   struct coding_system coding;
6304
6305   if (NILP (coding_system))
6306     coding_system = Qundecided;
6307   setup_coding_system (coding_system, &coding);
6308   attrs = CODING_ID_ATTRS (coding.id);
6309   eol_type = CODING_ID_EOL_TYPE (coding.id);
6310
6311   coding.source = src;
6312   coding.src_bytes = src_bytes;
6313   coding.src_multibyte = multibytep;
6314   coding.consumed = 0;
6315
6316   if (XINT (CODING_ATTR_CATEGORY (attrs)) != coding_category_undecided)
6317     {
6318       mask = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6319     }
6320   else
6321     {
6322       coding_system = Qnil;
6323       for (; src < src_end; src++)
6324         {
6325           c = *src;
6326           if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
6327                                         || c == ISO_CODE_SI
6328                                         || c == ISO_CODE_SO)))
6329             break;
6330         }
6331       coding.head_ascii = src - coding.source;
6332
6333       if (src < src_end)
6334         for (i = 0; i < coding_category_raw_text; i++)
6335           {
6336             enum coding_category category = coding_priorities[i];
6337             struct coding_system *this = coding_categories + category;
6338
6339             if (category >= coding_category_raw_text
6340                 || detected & (1 << category))
6341               continue;
6342
6343             if (this->id < 0)
6344               {
6345                 /* No coding system of this category is defined.  */
6346                 mask &= ~(1 << category);
6347               }
6348             else
6349               {
6350                 detected |= detected_mask[category];
6351                 if ((*(coding_categories[category].detector)) (&coding, &mask)
6352                     && highest)
6353                   {
6354                     mask &= detected_mask[category];
6355                     break;
6356                   }
6357               }
6358           }
6359     }
6360
6361   if (!mask)
6362     val = Fcons (make_number (coding_category_raw_text), Qnil);
6363   else if (mask == CATEGORY_MASK_ANY)
6364     val = Fcons (make_number (coding_category_undecided), Qnil);
6365   else if (highest)
6366     {
6367       for (i = 0; i < coding_category_raw_text; i++)
6368         if (mask & (1 << coding_priorities[i]))
6369           {
6370             val = Fcons (make_number (coding_priorities[i]), Qnil);
6371             break;
6372           }
6373     }
6374   else
6375     {
6376       val = Qnil;
6377       for (i = coding_category_raw_text - 1; i >= 0; i--)
6378         if (mask & (1 << coding_priorities[i]))
6379           val = Fcons (make_number (coding_priorities[i]), val);
6380     }
6381
6382   {
6383     int one_byte_eol = -1, two_byte_eol = -1;
6384     Lisp_Object tail;
6385
6386     for (tail = val; CONSP (tail); tail = XCDR (tail))
6387       {
6388         struct coding_system *this
6389           = (NILP (coding_system) ? coding_categories + XINT (XCAR (tail))
6390              : &coding);
6391         int this_eol;
6392
6393         attrs = CODING_ID_ATTRS (this->id);
6394         eol_type = CODING_ID_EOL_TYPE (this->id);
6395         XSETCAR (tail, CODING_ID_NAME (this->id));
6396         if (VECTORP (eol_type))
6397           {
6398             if (EQ (CODING_ATTR_TYPE (attrs), Qutf_16))
6399               {
6400                 if (two_byte_eol < 0)
6401                   two_byte_eol = detect_eol (this, coding.source, src_bytes);
6402                 this_eol = two_byte_eol;
6403               }
6404             else
6405               {
6406                 if (one_byte_eol < 0)
6407                   one_byte_eol =detect_eol (this, coding.source, src_bytes);
6408                 this_eol = one_byte_eol;
6409               }
6410             if (this_eol == EOL_SEEN_LF)
6411               XSETCAR (tail, AREF (eol_type, 0));
6412             else if (this_eol == EOL_SEEN_CRLF)
6413               XSETCAR (tail, AREF (eol_type, 1));
6414             else if (this_eol == EOL_SEEN_CR)
6415               XSETCAR (tail, AREF (eol_type, 2));
6416           }
6417       }
6418   }
6419
6420   return (highest ? XCAR (val) : val);
6421 }
6422
6423
6424 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6425        2, 3, 0,
6426        doc: /* Detect coding system of the text in the region between START and END.
6427 Return a list of possible coding systems ordered by priority.
6428
6429 If only ASCII characters are found, it returns a list of single element
6430 `undecided' or its subsidiary coding system according to a detected
6431 end-of-line format.
6432
6433 If optional argument HIGHEST is non-nil, return the coding system of
6434 highest priority.  */)
6435      (start, end, highest)
6436      Lisp_Object start, end, highest;
6437 {
6438   int from, to;
6439   int from_byte, to_byte;
6440
6441   CHECK_NUMBER_COERCE_MARKER (start);
6442   CHECK_NUMBER_COERCE_MARKER (end);
6443
6444   validate_region (&start, &end);
6445   from = XINT (start), to = XINT (end);
6446   from_byte = CHAR_TO_BYTE (from);
6447   to_byte = CHAR_TO_BYTE (to);
6448
6449   if (from < GPT && to >= GPT)
6450     move_gap_both (to, to_byte);
6451
6452   return detect_coding_system (BYTE_POS_ADDR (from_byte),
6453                                to_byte - from_byte,
6454                                !NILP (highest),
6455                                !NILP (current_buffer
6456                                       ->enable_multibyte_characters),
6457                                Qnil);
6458 }
6459
6460 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6461        1, 2, 0,
6462        doc: /* Detect coding system of the text in STRING.
6463 Return a list of possible coding systems ordered by priority.
6464
6465 If only ASCII characters are found, it returns a list of single element
6466 `undecided' or its subsidiary coding system according to a detected
6467 end-of-line format.
6468
6469 If optional argument HIGHEST is non-nil, return the coding system of
6470 highest priority.  */)
6471      (string, highest)
6472      Lisp_Object string, highest;
6473 {
6474   CHECK_STRING (string);
6475
6476   return detect_coding_system (XSTRING (string)->data,
6477                                STRING_BYTES (XSTRING (string)),
6478                                !NILP (highest),
6479                                STRING_MULTIBYTE (string),
6480                                Qnil);
6481 }
6482
6483
6484 static INLINE int
6485 char_encodable_p (c, attrs)
6486      int c;
6487      Lisp_Object attrs;
6488 {
6489   Lisp_Object tail;
6490   struct charset *charset;
6491
6492   for (tail = CODING_ATTR_CHARSET_LIST (attrs);
6493        CONSP (tail); tail = XCDR (tail))
6494     {
6495       charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
6496       if (CHAR_CHARSET_P (c, charset))
6497         break;
6498     }
6499   return (! NILP (tail));
6500 }
6501
6502
6503 /* Return a list of coding systems that safely encode the text between
6504    START and END.  If EXCLUDE is non-nil, it is a list of coding
6505    systems not to check.  The returned list doesn't contain any such
6506    coding systems.  In any case, If the text contains only ASCII or is
6507    unibyte, return t.  */
6508
6509 DEFUN ("find-coding-systems-region-internal",
6510        Ffind_coding_systems_region_internal,
6511        Sfind_coding_systems_region_internal, 2, 3, 0,
6512        doc: /* Internal use only.  */)
6513      (start, end, exclude)
6514      Lisp_Object start, end, exclude;
6515 {
6516   Lisp_Object coding_attrs_list, safe_codings;
6517   EMACS_INT start_byte, end_byte;
6518   unsigned char *p, *pbeg, *pend;
6519   int c;
6520   Lisp_Object tail, elt;
6521
6522   if (STRINGP (start))
6523     {
6524       if (!STRING_MULTIBYTE (start)
6525           && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6526         return Qt;
6527       start_byte = 0;
6528       end_byte = STRING_BYTES (XSTRING (start));
6529     }
6530   else
6531     {
6532       CHECK_NUMBER_COERCE_MARKER (start);
6533       CHECK_NUMBER_COERCE_MARKER (end);
6534       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6535         args_out_of_range (start, end);
6536       if (NILP (current_buffer->enable_multibyte_characters))
6537         return Qt;
6538       start_byte = CHAR_TO_BYTE (XINT (start));
6539       end_byte = CHAR_TO_BYTE (XINT (end));
6540       if (XINT (end) - XINT (start) == end_byte - start_byte)
6541         return Qt;
6542
6543       if (start < GPT && end > GPT)
6544         {
6545           if ((GPT - start) < (end - GPT))
6546             move_gap_both (start, start_byte);
6547           else
6548             move_gap_both (end, end_byte);
6549         }
6550     }
6551
6552   coding_attrs_list = Qnil;
6553   for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
6554     if (NILP (exclude)
6555         || NILP (Fmemq (XCAR (tail), exclude)))
6556       {
6557         Lisp_Object attrs;
6558
6559         attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
6560         if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
6561             && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
6562           coding_attrs_list = Fcons (attrs, coding_attrs_list);
6563       }
6564
6565   if (STRINGP (start))
6566     p = pbeg = XSTRING (start)->data;
6567   else
6568     p = pbeg = BYTE_POS_ADDR (start_byte);
6569   pend = p + (end_byte - start_byte);
6570
6571   while (p < pend && ASCII_BYTE_P (*p)) p++;
6572   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6573
6574   while (p < pend)
6575     {
6576       if (ASCII_BYTE_P (*p))
6577         p++;
6578       else
6579         {
6580           c = STRING_CHAR_ADVANCE (p);
6581
6582           charset_map_loaded = 0;
6583           for (tail = coding_attrs_list; CONSP (tail);)
6584             {
6585               elt = XCAR (tail);
6586               if (NILP (elt))
6587                 tail = XCDR (tail);
6588               else if (char_encodable_p (c, elt))
6589                 tail = XCDR (tail);
6590               else if (CONSP (XCDR (tail)))
6591                 {
6592                   XSETCAR (tail, XCAR (XCDR (tail)));
6593                   XSETCDR (tail, XCDR (XCDR (tail)));
6594                 }
6595               else
6596                 {
6597                   XSETCAR (tail, Qnil);
6598                   tail = XCDR (tail);
6599                 }
6600             }
6601           if (charset_map_loaded)
6602             {
6603               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6604
6605               if (STRINGP (start))
6606                 pbeg = XSTRING (start)->data;
6607               else
6608                 pbeg = BYTE_POS_ADDR (start_byte);
6609               p = pbeg + p_offset;
6610               pend = pbeg + pend_offset;
6611             }
6612         }
6613     }
6614
6615   safe_codings = Qnil;
6616   for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
6617     if (! NILP (XCAR (tail)))
6618       safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
6619
6620   return safe_codings;
6621 }
6622
6623
6624 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
6625        Scheck_coding_systems_region, 3, 3, 0,
6626        doc: /* Check if the region is encodable by coding systems.
6627
6628 START and END are buffer positions specifying the region.
6629 CODING-SYSTEM-LIST is a list of coding systems to check.
6630
6631 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
6632 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
6633 whole region, POS0, POS1, ... are buffer positions where non-encodable
6634 characters are found.
6635
6636 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
6637 value is nil.
6638
6639 START may be a string.  In that case, check if the string is
6640 encodable, and the value contains indices to the string instead of
6641 buffer positions.  END is ignored.  */)
6642      (start, end, coding_system_list)
6643      Lisp_Object start, end, coding_system_list;
6644 {
6645   Lisp_Object list;
6646   EMACS_INT start_byte, end_byte;
6647   int pos;
6648   unsigned char *p, *pbeg, *pend;
6649   int c;
6650   Lisp_Object tail, elt;
6651
6652   if (STRINGP (start))
6653     {
6654       if (!STRING_MULTIBYTE (start)
6655           && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
6656         return Qnil;
6657       start_byte = 0;
6658       end_byte = STRING_BYTES (XSTRING (start));
6659       pos = 0;
6660     }
6661   else
6662     {
6663       CHECK_NUMBER_COERCE_MARKER (start);
6664       CHECK_NUMBER_COERCE_MARKER (end);
6665       if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
6666         args_out_of_range (start, end);
6667       if (NILP (current_buffer->enable_multibyte_characters))
6668         return Qnil;
6669       start_byte = CHAR_TO_BYTE (XINT (start));
6670       end_byte = CHAR_TO_BYTE (XINT (end));
6671       if (XINT (end) - XINT (start) == end_byte - start_byte)
6672         return Qt;
6673
6674       if (start < GPT && end > GPT)
6675         {
6676           if ((GPT - start) < (end - GPT))
6677             move_gap_both (start, start_byte);
6678           else
6679             move_gap_both (end, end_byte);
6680         }
6681       pos = start;
6682     }
6683
6684   list = Qnil;
6685   for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
6686     {
6687       elt = XCAR (tail);
6688       list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
6689                                        Qnil)),
6690                     list);
6691     }
6692
6693   if (STRINGP (start))
6694     p = pbeg = XSTRING (start)->data;
6695   else
6696     p = pbeg = BYTE_POS_ADDR (start_byte);
6697   pend = p + (end_byte - start_byte);
6698
6699   while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
6700   while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
6701
6702   while (p < pend)
6703     {
6704       if (ASCII_BYTE_P (*p))
6705         p++;
6706       else
6707         {
6708           c = STRING_CHAR_ADVANCE (p);
6709
6710           charset_map_loaded = 0;
6711           for (tail = list; CONSP (tail); tail = XCDR (tail))
6712             {
6713               elt = XCDR (XCAR (tail));
6714               if (! char_encodable_p (c, XCAR (elt)))
6715                 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
6716             }
6717           if (charset_map_loaded)
6718             {
6719               EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
6720
6721               if (STRINGP (start))
6722                 pbeg = XSTRING (start)->data;
6723               else
6724                 pbeg = BYTE_POS_ADDR (start_byte);
6725               p = pbeg + p_offset;
6726               pend = pbeg + pend_offset;
6727             }
6728         }
6729       pos++;
6730     }
6731
6732   tail = list;
6733   list = Qnil;
6734   for (; CONSP (tail); tail = XCDR (tail))
6735     {
6736       elt = XCAR (tail);
6737       if (CONSP (XCDR (XCDR (elt))))
6738         list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
6739                       list);
6740     }
6741
6742   return list;
6743 }
6744
6745
6746
6747 Lisp_Object
6748 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
6749      Lisp_Object start, end, coding_system, dst_object;
6750      int encodep, norecord;
6751 {
6752   struct coding_system coding;
6753   EMACS_INT from, from_byte, to, to_byte;
6754   Lisp_Object src_object;
6755
6756   CHECK_NUMBER_COERCE_MARKER (start);
6757   CHECK_NUMBER_COERCE_MARKER (end);
6758   if (NILP (coding_system))
6759     coding_system = Qno_conversion;
6760   else
6761     CHECK_CODING_SYSTEM (coding_system);
6762   src_object = Fcurrent_buffer ();
6763   if (NILP (dst_object))
6764     dst_object = src_object;
6765   else if (! EQ (dst_object, Qt))
6766     CHECK_BUFFER (dst_object);
6767
6768   validate_region (&start, &end);
6769   from = XFASTINT (start);
6770   from_byte = CHAR_TO_BYTE (from);
6771   to = XFASTINT (end);
6772   to_byte = CHAR_TO_BYTE (to);
6773
6774   setup_coding_system (coding_system, &coding);
6775   coding.mode |= CODING_MODE_LAST_BLOCK;
6776
6777   if (encodep)
6778     encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6779                           dst_object);
6780   else
6781     decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
6782                           dst_object);
6783   if (! norecord)
6784     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6785
6786   if (coding.result != CODING_RESULT_SUCCESS)
6787     error ("Code conversion error: %d", coding.result);
6788
6789   return (BUFFERP (dst_object)
6790           ? make_number (coding.produced_char)
6791           : coding.dst_object);
6792 }
6793
6794
6795 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
6796        3, 4, "r\nzCoding system: ",
6797        doc: /* Decode the current region from the specified coding system.
6798 When called from a program, takes four arguments:
6799         START, END, CODING-SYSTEM, and DESTINATION.
6800 START and END are buffer positions.
6801
6802 Optional 4th arguments DESTINATION specifies where the decoded text goes.
6803 If nil, the region between START and END is replace by the decoded text.
6804 If buffer, the decoded text is inserted in the buffer.
6805 If t, the decoded text is returned.
6806
6807 This function sets `last-coding-system-used' to the precise coding system
6808 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6809 not fully specified.)
6810 It returns the length of the decoded text.  */)
6811      (start, end, coding_system, destination)
6812      Lisp_Object start, end, coding_system, destination;
6813 {
6814   return code_convert_region (start, end, coding_system, destination, 0, 0);
6815 }
6816
6817 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
6818        3, 4, "r\nzCoding system: ",
6819        doc: /* Encode the current region by specified coding system.
6820 When called from a program, takes three arguments:
6821 START, END, and CODING-SYSTEM.  START and END are buffer positions.
6822
6823 Optional 4th arguments DESTINATION specifies where the encoded text goes.
6824 If nil, the region between START and END is replace by the encoded text.
6825 If buffer, the encoded text is inserted in the buffer.
6826 If t, the encoded text is returned.
6827
6828 This function sets `last-coding-system-used' to the precise coding system
6829 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6830 not fully specified.)
6831 It returns the length of the encoded text.  */)
6832   (start, end, coding_system, destination)
6833      Lisp_Object start, end, coding_system, destination;
6834 {
6835   return code_convert_region (start, end, coding_system, destination, 1, 0);
6836 }
6837
6838 Lisp_Object
6839 code_convert_string (string, coding_system, dst_object,
6840                      encodep, nocopy, norecord)
6841      Lisp_Object string, coding_system, dst_object;
6842      int encodep, nocopy, norecord;
6843 {
6844   struct coding_system coding;
6845   EMACS_INT chars, bytes;
6846
6847   CHECK_STRING (string);
6848   if (NILP (coding_system))
6849     {
6850       if (! norecord)
6851         Vlast_coding_system_used = Qno_conversion;
6852       if (NILP (dst_object))
6853         return (nocopy ? Fcopy_sequence (string) : string);
6854     }
6855
6856   if (NILP (coding_system))
6857     coding_system = Qno_conversion;
6858   else
6859     CHECK_CODING_SYSTEM (coding_system);
6860   if (NILP (dst_object))
6861     dst_object = Qt;
6862   else if (! EQ (dst_object, Qt))
6863     CHECK_BUFFER (dst_object);
6864
6865   setup_coding_system (coding_system, &coding);
6866   coding.mode |= CODING_MODE_LAST_BLOCK;
6867   chars = XSTRING (string)->size;
6868   bytes = STRING_BYTES (XSTRING (string));
6869   if (encodep)
6870     encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6871   else
6872     decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
6873   if (! norecord)
6874     Vlast_coding_system_used = CODING_ID_NAME (coding.id);
6875
6876   if (coding.result != CODING_RESULT_SUCCESS)
6877     error ("Code conversion error: %d", coding.result);
6878
6879   return (BUFFERP (dst_object)
6880           ? make_number (coding.produced_char)
6881           : coding.dst_object);
6882 }
6883
6884
6885 /* Encode or decode STRING according to CODING_SYSTEM.
6886    Do not set Vlast_coding_system_used.
6887
6888    This function is called only from macros DECODE_FILE and
6889    ENCODE_FILE, thus we ignore character composition.  */
6890
6891 Lisp_Object
6892 code_convert_string_norecord (string, coding_system, encodep)
6893      Lisp_Object string, coding_system;
6894      int encodep;
6895 {
6896   return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
6897 }
6898
6899
6900 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
6901        2, 4, 0,
6902        doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
6903
6904 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
6905 if the decoding operation is trivial.
6906
6907 Optional fourth arg BUFFER non-nil meant that the decoded text is
6908 inserted in BUFFER instead of returned as a astring.  In this case,
6909 the return value is BUFFER.
6910
6911 This function sets `last-coding-system-used' to the precise coding system
6912 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6913 not fully specified.  */)
6914   (string, coding_system, nocopy, buffer)
6915      Lisp_Object string, coding_system, nocopy, buffer;
6916 {
6917   return code_convert_string (string, coding_system, buffer,
6918                               0, ! NILP (nocopy), 0);
6919 }
6920
6921 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
6922        2, 4, 0,
6923        doc: /* Encode STRING to CODING-SYSTEM, and return the result.
6924
6925 Optional third arg NOCOPY non-nil means it is OK to return STRING
6926 itself if the encoding operation is trivial.
6927
6928 Optional fourth arg BUFFER non-nil meant that the encoded text is
6929 inserted in BUFFER instead of returned as a astring.  In this case,
6930 the return value is BUFFER.
6931
6932 This function sets `last-coding-system-used' to the precise coding system
6933 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
6934 not fully specified.)  */)
6935      (string, coding_system, nocopy, buffer)
6936      Lisp_Object string, coding_system, nocopy, buffer;
6937 {
6938   return code_convert_string (string, coding_system, buffer,
6939                               nocopy, ! NILP (nocopy), 1);
6940 }
6941
6942 \f
6943 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
6944        doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
6945 Return the corresponding character.  */)
6946      (code)
6947      Lisp_Object code;
6948 {
6949   Lisp_Object spec, attrs, val;
6950   struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
6951   int c;
6952
6953   CHECK_NATNUM (code);
6954   c = XFASTINT (code);
6955   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
6956   attrs = AREF (spec, 0);
6957
6958   if (ASCII_BYTE_P (c)
6959       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
6960     return code;
6961
6962   val = CODING_ATTR_CHARSET_LIST (attrs);
6963   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6964   charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
6965   charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
6966
6967   if (c <= 0x7F)
6968     charset = charset_roman;
6969   else if (c >= 0xA0 && c < 0xDF)
6970     {
6971       charset = charset_kana;
6972       c -= 0x80;
6973     }
6974   else
6975     {
6976       int s1 = c >> 8, s2 = c & 0xFF;
6977
6978       if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
6979           || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
6980         error ("Invalid code: %d", code);
6981       SJIS_TO_JIS (c);
6982       charset = charset_kanji;
6983     }
6984   c = DECODE_CHAR (charset, c);
6985   if (c < 0)
6986     error ("Invalid code: %d", code);
6987   return make_number (c);
6988 }
6989
6990
6991 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
6992        doc: /* Encode a Japanese character CHAR to shift_jis encoding.
6993 Return the corresponding code in SJIS.  */)
6994      (ch)
6995     Lisp_Object ch;
6996 {
6997   Lisp_Object spec, attrs, charset_list;
6998   int c;
6999   struct charset *charset;
7000   unsigned code;
7001
7002   CHECK_CHARACTER (ch);
7003   c = XFASTINT (ch);
7004   CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7005   attrs = AREF (spec, 0);
7006
7007   if (ASCII_CHAR_P (c)
7008       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7009     return ch;
7010
7011   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7012   charset = char_charset (c, charset_list, &code);
7013   if (code == CHARSET_INVALID_CODE (charset))
7014     error ("Can't encode by shift_jis encoding: %d", c);
7015   JIS_TO_SJIS (code);
7016
7017   return make_number (code);
7018 }
7019
7020 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7021        doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7022 Return the corresponding character.  */)
7023      (code)
7024      Lisp_Object code;
7025 {
7026   Lisp_Object spec, attrs, val;
7027   struct charset *charset_roman, *charset_big5, *charset;
7028   int c;
7029
7030   CHECK_NATNUM (code);
7031   c = XFASTINT (code);
7032   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7033   attrs = AREF (spec, 0);
7034
7035   if (ASCII_BYTE_P (c)
7036       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7037     return code;
7038
7039   val = CODING_ATTR_CHARSET_LIST (attrs);
7040   charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7041   charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7042
7043   if (c <= 0x7F)
7044     charset = charset_roman;
7045   else
7046     {
7047       int b1 = c >> 8, b2 = c & 0x7F;
7048       if (b1 < 0xA1 || b1 > 0xFE
7049           || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7050         error ("Invalid code: %d", code);
7051       charset = charset_big5;
7052     }
7053   c = DECODE_CHAR (charset, (unsigned )c);
7054   if (c < 0)
7055     error ("Invalid code: %d", code);
7056   return make_number (c);
7057 }
7058
7059 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7060        doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7061 Return the corresponding character code in Big5.  */)
7062      (ch)
7063      Lisp_Object ch;
7064 {
7065   Lisp_Object spec, attrs, charset_list;
7066   struct charset *charset;
7067   int c;
7068   unsigned code;
7069
7070   CHECK_CHARACTER (ch);
7071   c = XFASTINT (ch);
7072   CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7073   attrs = AREF (spec, 0);
7074   if (ASCII_CHAR_P (c)
7075       && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7076     return ch;
7077
7078   charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7079   charset = char_charset (c, charset_list, &code);
7080   if (code == CHARSET_INVALID_CODE (charset))
7081     error ("Can't encode by Big5 encoding: %d", c);
7082
7083   return make_number (code);
7084 }
7085
7086 \f
7087 DEFUN ("set-terminal-coding-system-internal",
7088        Fset_terminal_coding_system_internal,
7089        Sset_terminal_coding_system_internal, 1, 1, 0,
7090        doc: /* Internal use only.  */)
7091      (coding_system)
7092      Lisp_Object coding_system;
7093 {
7094   CHECK_SYMBOL (coding_system);
7095   setup_coding_system (Fcheck_coding_system (coding_system),
7096                         &terminal_coding);
7097
7098   /* We had better not send unsafe characters to terminal.  */
7099   terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7100   /* Characer composition should be disabled.  */
7101   terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7102   terminal_coding.src_multibyte = 1;
7103   terminal_coding.dst_multibyte = 0;
7104   return Qnil;
7105 }
7106
7107 DEFUN ("set-safe-terminal-coding-system-internal",
7108        Fset_safe_terminal_coding_system_internal,
7109        Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7110        doc: /* Internal use only.  */)
7111      (coding_system)
7112      Lisp_Object coding_system;
7113 {
7114   CHECK_SYMBOL (coding_system);
7115   setup_coding_system (Fcheck_coding_system (coding_system),
7116                        &safe_terminal_coding);
7117   /* Characer composition should be disabled.  */
7118   safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7119   safe_terminal_coding.src_multibyte = 1;
7120   safe_terminal_coding.dst_multibyte = 0;
7121   return Qnil;
7122 }
7123
7124 DEFUN ("terminal-coding-system",
7125        Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7126        doc: /* Return coding system specified for terminal output.  */)
7127      ()
7128 {
7129   return CODING_ID_NAME (terminal_coding.id);
7130 }
7131
7132 DEFUN ("set-keyboard-coding-system-internal",
7133        Fset_keyboard_coding_system_internal,
7134        Sset_keyboard_coding_system_internal, 1, 1, 0,
7135        doc: /* Internal use only.  */)
7136      (coding_system)
7137      Lisp_Object coding_system;
7138 {
7139   CHECK_SYMBOL (coding_system);
7140   setup_coding_system (Fcheck_coding_system (coding_system),
7141                        &keyboard_coding);
7142   /* Characer composition should be disabled.  */
7143   keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7144   return Qnil;
7145 }
7146
7147 DEFUN ("keyboard-coding-system",
7148        Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7149        doc: /* Return coding system specified for decoding keyboard input.  */)
7150      ()
7151 {
7152   return CODING_ID_NAME (keyboard_coding.id);
7153 }
7154
7155 \f
7156 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7157        Sfind_operation_coding_system,  1, MANY, 0,
7158        doc: /* Choose a coding system for an operation based on the target name.
7159 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7160 DECODING-SYSTEM is the coding system to use for decoding
7161 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7162 for encoding (in case OPERATION does encoding).
7163
7164 The first argument OPERATION specifies an I/O primitive:
7165   For file I/O, `insert-file-contents' or `write-region'.
7166   For process I/O, `call-process', `call-process-region', or `start-process'.
7167   For network I/O, `open-network-stream'.
7168
7169 The remaining arguments should be the same arguments that were passed
7170 to the primitive.  Depending on which primitive, one of those arguments
7171 is selected as the TARGET.  For example, if OPERATION does file I/O,
7172 whichever argument specifies the file name is TARGET.
7173
7174 TARGET has a meaning which depends on OPERATION:
7175   For file I/O, TARGET is a file name.
7176   For process I/O, TARGET is a process name.
7177   For network I/O, TARGET is a service name or a port number
7178
7179 This function looks up what specified for TARGET in,
7180 `file-coding-system-alist', `process-coding-system-alist',
7181 or `network-coding-system-alist' depending on OPERATION.
7182 They may specify a coding system, a cons of coding systems,
7183 or a function symbol to call.
7184 In the last case, we call the function with one argument,
7185 which is a list of all the arguments given to this function.
7186
7187 usage: (find-operation-coding-system OPERATION ARGUMENTS ...)  */)
7188      (nargs, args)
7189      int nargs;
7190      Lisp_Object *args;
7191 {
7192   Lisp_Object operation, target_idx, target, val;
7193   register Lisp_Object chain;
7194
7195   if (nargs < 2)
7196     error ("Too few arguments");
7197   operation = args[0];
7198   if (!SYMBOLP (operation)
7199       || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7200     error ("Invalid first arguement");
7201   if (nargs < 1 + XINT (target_idx))
7202     error ("Too few arguments for operation: %s",
7203            XSYMBOL (operation)->name->data);
7204   target = args[XINT (target_idx) + 1];
7205   if (!(STRINGP (target)
7206         || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7207     error ("Invalid %dth argument", XINT (target_idx) + 1);
7208
7209   chain = ((EQ (operation, Qinsert_file_contents)
7210             || EQ (operation, Qwrite_region))
7211            ? Vfile_coding_system_alist
7212            : (EQ (operation, Qopen_network_stream)
7213               ? Vnetwork_coding_system_alist
7214               : Vprocess_coding_system_alist));
7215   if (NILP (chain))
7216     return Qnil;
7217
7218   for (; CONSP (chain); chain = XCDR (chain))
7219     {
7220       Lisp_Object elt;
7221
7222       elt = XCAR (chain);
7223       if (CONSP (elt)
7224           && ((STRINGP (target)
7225                && STRINGP (XCAR (elt))
7226                && fast_string_match (XCAR (elt), target) >= 0)
7227               || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7228         {
7229           val = XCDR (elt);
7230           /* Here, if VAL is both a valid coding system and a valid
7231              function symbol, we return VAL as a coding system.  */
7232           if (CONSP (val))
7233             return val;
7234           if (! SYMBOLP (val))
7235             return Qnil;
7236           if (! NILP (Fcoding_system_p (val)))
7237             return Fcons (val, val);
7238           if (! NILP (Ffboundp (val)))
7239             {
7240               val = call1 (val, Flist (nargs, args));
7241               if (CONSP (val))
7242                 return val;
7243               if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7244                 return Fcons (val, val);
7245             }
7246           return Qnil;
7247         }
7248     }
7249   return Qnil;
7250 }
7251
7252 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7253        Sset_coding_system_priority, 1, MANY, 0,
7254        doc: /* Assign higher priority to the coding systems given as arguments.
7255 usage: (set-coding-system-priority CODING-SYSTEM ...)  */)
7256      (nargs, args)
7257      int nargs;
7258      Lisp_Object *args;
7259 {
7260   int i, j;
7261   int changed[coding_category_max];
7262   enum coding_category priorities[coding_category_max];
7263
7264   bzero (changed, sizeof changed);
7265
7266   for (i = j = 0; i < nargs; i++)
7267     {
7268       enum coding_category category;
7269       Lisp_Object spec, attrs;
7270
7271       CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7272       attrs = AREF (spec, 0);
7273       category = XINT (CODING_ATTR_CATEGORY (attrs));
7274       if (changed[category])
7275         /* Ignore this coding system because a coding system of the
7276            same category already had a higher priority.  */
7277         continue;
7278       changed[category] = 1;
7279       priorities[j++] = category;
7280       if (coding_categories[category].id >= 0
7281           && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7282         setup_coding_system (args[i], &coding_categories[category]);
7283     }
7284
7285   /* Now we have decided top J priorities.  Reflect the order of the
7286      original priorities to the remaining priorities.  */
7287
7288   for (i = j, j = 0; i < coding_category_max; i++, j++)
7289     {
7290       while (j < coding_category_max
7291              && changed[coding_priorities[j]])
7292         j++;
7293       if (j == coding_category_max)
7294         abort ();
7295       priorities[i] = coding_priorities[j];
7296     }
7297
7298   bcopy (priorities, coding_priorities, sizeof priorities);
7299   return Qnil;
7300 }
7301
7302 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7303        Scoding_system_priority_list, 0, 1, 0,
7304        doc: /* Return a list of coding systems ordered by their priorities.
7305 HIGHESTP non-nil means just return the highest priority one.  */)
7306      (highestp)
7307      Lisp_Object highestp;
7308 {
7309   int i;
7310   Lisp_Object val;
7311
7312   for (i = 0, val = Qnil; i < coding_category_max; i++)
7313     {
7314       enum coding_category category = coding_priorities[i];
7315       int id = coding_categories[category].id;
7316       Lisp_Object attrs;
7317
7318       if (id < 0)
7319         continue;
7320       attrs = CODING_ID_ATTRS (id);
7321       if (! NILP (highestp))
7322         return CODING_ATTR_BASE_NAME (attrs);
7323       val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7324     }
7325   return Fnreverse (val);
7326 }
7327
7328 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7329
7330 static Lisp_Object
7331 make_subsidiaries (base)
7332      Lisp_Object base;
7333 {
7334   Lisp_Object subsidiaries;
7335   int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7336   char *buf = (char *) alloca (base_name_len + 6);
7337   int i;
7338
7339   bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7340   subsidiaries = Fmake_vector (make_number (3), Qnil);
7341   for (i = 0; i < 3; i++)
7342     {
7343       bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7344       ASET (subsidiaries, i, intern (buf));
7345     }
7346   return subsidiaries;
7347 }
7348
7349
7350 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7351        Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7352        doc: /* For internal use only.
7353 usage: (define-coding-system-internal ...)  */)
7354      (nargs, args)
7355      int nargs;
7356      Lisp_Object *args;
7357 {
7358   Lisp_Object name;
7359   Lisp_Object spec_vec;         /* [ ATTRS ALIASE EOL_TYPE ] */
7360   Lisp_Object attrs;            /* Vector of attributes.  */
7361   Lisp_Object eol_type;
7362   Lisp_Object aliases;
7363   Lisp_Object coding_type, charset_list, safe_charsets;
7364   enum coding_category category;
7365   Lisp_Object tail, val;
7366   int max_charset_id = 0;
7367   int i;
7368
7369   if (nargs < coding_arg_max)
7370     goto short_args;
7371
7372   attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7373
7374   name = args[coding_arg_name];
7375   CHECK_SYMBOL (name);
7376   CODING_ATTR_BASE_NAME (attrs) = name;
7377
7378   val = args[coding_arg_mnemonic];
7379   if (! STRINGP (val))
7380     CHECK_CHARACTER (val);
7381   CODING_ATTR_MNEMONIC (attrs) = val;
7382
7383   coding_type = args[coding_arg_coding_type];
7384   CHECK_SYMBOL (coding_type);
7385   CODING_ATTR_TYPE (attrs) = coding_type;
7386
7387   charset_list = args[coding_arg_charset_list];
7388   if (SYMBOLP (charset_list))
7389     {
7390       if (EQ (charset_list, Qiso_2022))
7391         {
7392           if (! EQ (coding_type, Qiso_2022))
7393             error ("Invalid charset-list");
7394           charset_list = Viso_2022_charset_list;
7395         }
7396       else if (EQ (charset_list, Qemacs_mule))
7397         {
7398           if (! EQ (coding_type, Qemacs_mule))
7399             error ("Invalid charset-list");
7400           charset_list = Vemacs_mule_charset_list;
7401         }
7402       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7403         if (max_charset_id < XFASTINT (XCAR (tail)))
7404           max_charset_id = XFASTINT (XCAR (tail));
7405     }
7406   else
7407     {
7408       charset_list = Fcopy_sequence (charset_list);
7409       for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7410         {
7411           struct charset *charset;
7412
7413           val = Fcar (tail);
7414           CHECK_CHARSET_GET_CHARSET (val, charset);
7415           if (EQ (coding_type, Qiso_2022)
7416               ? CHARSET_ISO_FINAL (charset) < 0
7417               : EQ (coding_type, Qemacs_mule)
7418               ? CHARSET_EMACS_MULE_ID (charset) < 0
7419               : 0)
7420             error ("Can't handle charset `%s'",
7421                    XSYMBOL (CHARSET_NAME (charset))->name->data);
7422
7423           XCAR (tail) = make_number (charset->id);
7424           if (max_charset_id < charset->id)
7425             max_charset_id = charset->id;
7426         }
7427     }
7428   CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7429
7430   safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7431                                 make_number (255));
7432   for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7433     XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7434   CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7435
7436   val = args[coding_arg_decode_translation_table];
7437   if (! NILP (val))
7438     CHECK_CHAR_TABLE (val);
7439   CODING_ATTR_DECODE_TBL (attrs) = val;
7440
7441   val = args[coding_arg_encode_translation_table];
7442   if (! NILP (val))
7443     CHECK_CHAR_TABLE (val);
7444   CODING_ATTR_ENCODE_TBL (attrs) = val;
7445
7446   val = args[coding_arg_post_read_conversion];
7447   CHECK_SYMBOL (val);
7448   CODING_ATTR_POST_READ (attrs) = val;
7449
7450   val = args[coding_arg_pre_write_conversion];
7451   CHECK_SYMBOL (val);
7452   CODING_ATTR_PRE_WRITE (attrs) = val;
7453
7454   val = args[coding_arg_default_char];
7455   if (NILP (val))
7456     CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7457   else
7458     {
7459       CHECK_CHARACTER (val);
7460       CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7461     }
7462
7463   val = args[coding_arg_plist];
7464   CHECK_LIST (val);
7465   CODING_ATTR_PLIST (attrs) = val;
7466
7467   if (EQ (coding_type, Qcharset))
7468     {
7469       /* Generate a lisp vector of 256 elements.  Each element is nil,
7470          integer, or a list of charset IDs.
7471
7472          If Nth element is nil, the byte code N is invalid in this
7473          coding system.
7474
7475          If Nth element is a number NUM, N is the first byte of a
7476          charset whose ID is NUM.
7477
7478          If Nth element is a list of charset IDs, N is the first byte
7479          of one of them.  The list is sorted by dimensions of the
7480          charsets.  A charset of smaller dimension comes firtst.
7481       */
7482       val = Fmake_vector (make_number (256), Qnil);
7483
7484       for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7485         {
7486           struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
7487           int dim = CHARSET_DIMENSION (charset);
7488           int idx = (dim - 1) * 4;
7489
7490           for (i = charset->code_space[idx];
7491                i <= charset->code_space[idx + 1]; i++)
7492             {
7493               Lisp_Object tmp, tmp2;
7494               int dim2;
7495
7496               tmp = AREF (val, i);
7497               if (NILP (tmp))
7498                 tmp = XCAR (tail);
7499               else if (NUMBERP (tmp))
7500                 {
7501                   dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
7502                   if (dim < dim2)
7503                     tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
7504                   else
7505                     tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
7506                 }
7507               else
7508                 {
7509                   for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
7510                     {
7511                       dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
7512                       if (dim < dim2)
7513                         break;
7514                     }
7515                   if (NILP (tmp2))
7516                     tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
7517                   else
7518                     {
7519                       XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
7520                       XSETCAR (tmp2, XCAR (tail));
7521                     }
7522                 }
7523               ASET (val, i, tmp);
7524             }
7525         }
7526       ASET (attrs, coding_attr_charset_valids, val);
7527       category = coding_category_charset;
7528     }
7529   else if (EQ (coding_type, Qccl))
7530     {
7531       Lisp_Object valids;
7532
7533       if (nargs < coding_arg_ccl_max)
7534         goto short_args;
7535
7536       val = args[coding_arg_ccl_decoder];
7537       CHECK_CCL_PROGRAM (val);
7538       if (VECTORP (val))
7539         val = Fcopy_sequence (val);
7540       ASET (attrs, coding_attr_ccl_decoder, val);
7541
7542       val = args[coding_arg_ccl_encoder];
7543       CHECK_CCL_PROGRAM (val);
7544       if (VECTORP (val))
7545         val = Fcopy_sequence (val);
7546       ASET (attrs, coding_attr_ccl_encoder, val);
7547
7548       val = args[coding_arg_ccl_valids];
7549       valids = Fmake_string (make_number (256), make_number (0));
7550       for (tail = val; !NILP (tail); tail = Fcdr (tail))
7551         {
7552           val = Fcar (tail);
7553           if (INTEGERP (val))
7554             ASET (valids, XINT (val), 1);
7555           else
7556             {
7557               int from, to;
7558
7559               CHECK_CONS (val);
7560               CHECK_NUMBER (XCAR (val));
7561               CHECK_NUMBER (XCDR (val));
7562               from = XINT (XCAR (val));
7563               to = XINT (XCDR (val));
7564               for (i = from; i <= to; i++)
7565                 ASET (valids, i, 1);
7566             }
7567         }
7568       ASET (attrs, coding_attr_ccl_valids, valids);
7569
7570       category = coding_category_ccl;
7571     }
7572   else if (EQ (coding_type, Qutf_16))
7573     {
7574       Lisp_Object bom, endian;
7575
7576       if (nargs < coding_arg_utf16_max)
7577         goto short_args;
7578
7579       bom = args[coding_arg_utf16_bom];
7580       if (! NILP (bom) && ! EQ (bom, Qt))
7581         {
7582           CHECK_CONS (bom);
7583           CHECK_CODING_SYSTEM (XCAR (bom));
7584           CHECK_CODING_SYSTEM (XCDR (bom));
7585         }
7586       ASET (attrs, coding_attr_utf_16_bom, bom);
7587
7588       endian = args[coding_arg_utf16_endian];
7589       ASET (attrs, coding_attr_utf_16_endian, endian);
7590
7591       category = (CONSP (bom)
7592                   ? coding_category_utf_16_auto
7593                   : NILP (bom)
7594                   ? (NILP (endian)
7595                      ? coding_category_utf_16_be_nosig
7596                      : coding_category_utf_16_le_nosig)
7597                   : (NILP (endian)
7598                      ? coding_category_utf_16_be
7599                      : coding_category_utf_16_le));
7600     }
7601   else if (EQ (coding_type, Qiso_2022))
7602     {
7603       Lisp_Object initial, reg_usage, request, flags;
7604       int i, id;
7605
7606       if (nargs < coding_arg_iso2022_max)
7607         goto short_args;
7608
7609       initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
7610       CHECK_VECTOR (initial);
7611       for (i = 0; i < 4; i++)
7612         {
7613           val = Faref (initial, make_number (i));
7614           if (! NILP (val))
7615             {
7616               CHECK_CHARSET_GET_ID (val, id);
7617               ASET (initial, i, make_number (id));
7618             }
7619           else
7620             ASET (initial, i, make_number (-1));
7621         }
7622
7623       reg_usage = args[coding_arg_iso2022_reg_usage];
7624       CHECK_CONS (reg_usage);
7625       CHECK_NATNUM (XCAR (reg_usage));
7626       CHECK_NATNUM (XCDR (reg_usage));
7627
7628       request = Fcopy_sequence (args[coding_arg_iso2022_request]);
7629       for (tail = request; ! NILP (tail); tail = Fcdr (tail))
7630         {
7631           int id;
7632
7633           val = Fcar (tail);
7634           CHECK_CONS (val);
7635           CHECK_CHARSET_GET_ID (XCAR (val), id);
7636           CHECK_NATNUM (XCDR (val));
7637           if (XINT (XCDR (val)) >= 4)
7638             error ("Invalid graphic register number: %d", XINT (XCDR (val)));
7639           XCAR (val) = make_number (id);
7640         }
7641
7642       flags = args[coding_arg_iso2022_flags];
7643       CHECK_NATNUM (flags);
7644       i = XINT (flags);
7645       if (EQ (args[coding_arg_charset_list], Qiso_2022))
7646         flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
7647
7648       ASET (attrs, coding_attr_iso_initial, initial);
7649       ASET (attrs, coding_attr_iso_usage, reg_usage);
7650       ASET (attrs, coding_attr_iso_request, request);
7651       ASET (attrs, coding_attr_iso_flags, flags);
7652       setup_iso_safe_charsets (attrs);
7653
7654       if (i & CODING_ISO_FLAG_SEVEN_BITS)
7655         category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7656                           | CODING_ISO_FLAG_SINGLE_SHIFT))
7657                     ? coding_category_iso_7_else
7658                     : EQ (args[coding_arg_charset_list], Qiso_2022)
7659                     ? coding_category_iso_7
7660                     : coding_category_iso_7_tight);
7661       else
7662         {
7663           int id = XINT (AREF (initial, 1));
7664
7665           category = (((i & (CODING_ISO_FLAG_LOCKING_SHIFT
7666                              | CODING_ISO_FLAG_SINGLE_SHIFT))
7667                        || EQ (args[coding_arg_charset_list], Qiso_2022)
7668                        || id < 0)
7669                       ? coding_category_iso_8_else
7670                       : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
7671                       ? coding_category_iso_8_1
7672                       : coding_category_iso_8_2);
7673         }
7674     }
7675   else if (EQ (coding_type, Qemacs_mule))
7676     {
7677       if (EQ (args[coding_arg_charset_list], Qemacs_mule))
7678         ASET (attrs, coding_attr_emacs_mule_full, Qt);
7679
7680       category = coding_category_emacs_mule;
7681     }
7682   else if (EQ (coding_type, Qshift_jis))
7683     {
7684
7685       struct charset *charset;
7686
7687       if (XINT (Flength (charset_list)) != 3)
7688         error ("There should be just three charsets");
7689
7690       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7691       if (CHARSET_DIMENSION (charset) != 1)
7692         error ("Dimension of charset %s is not one",
7693                XSYMBOL (CHARSET_NAME (charset))->name->data);
7694
7695       charset_list = XCDR (charset_list);
7696       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7697       if (CHARSET_DIMENSION (charset) != 1)
7698         error ("Dimension of charset %s is not one",
7699                XSYMBOL (CHARSET_NAME (charset))->name->data);
7700
7701       charset_list = XCDR (charset_list);
7702       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7703       if (CHARSET_DIMENSION (charset) != 2)
7704         error ("Dimension of charset %s is not two",
7705                XSYMBOL (CHARSET_NAME (charset))->name->data);
7706
7707       category = coding_category_sjis;
7708       Vsjis_coding_system = name;
7709     }
7710   else if (EQ (coding_type, Qbig5))
7711     {
7712       struct charset *charset;
7713
7714       if (XINT (Flength (charset_list)) != 2)
7715         error ("There should be just two charsets");
7716
7717       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7718       if (CHARSET_DIMENSION (charset) != 1)
7719         error ("Dimension of charset %s is not one",
7720                XSYMBOL (CHARSET_NAME (charset))->name->data);
7721
7722       charset_list = XCDR (charset_list);
7723       charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
7724       if (CHARSET_DIMENSION (charset) != 2)
7725         error ("Dimension of charset %s is not two",
7726                XSYMBOL (CHARSET_NAME (charset))->name->data);
7727
7728       category = coding_category_big5;
7729       Vbig5_coding_system = name;
7730     }
7731   else if (EQ (coding_type, Qraw_text))
7732     category = coding_category_raw_text;
7733   else if (EQ (coding_type, Qutf_8))
7734     category = coding_category_utf_8;
7735   else if (EQ (coding_type, Qundecided))
7736     category = coding_category_undecided;
7737   else
7738     error ("Invalid coding system type: %s",
7739            XSYMBOL (coding_type)->name->data);
7740
7741   CODING_ATTR_CATEGORY (attrs) = make_number (category);
7742
7743   eol_type = args[coding_arg_eol_type];
7744   if (! NILP (eol_type)
7745       && ! EQ (eol_type, Qunix)
7746       && ! EQ (eol_type, Qdos)
7747       && ! EQ (eol_type, Qmac))
7748     error ("Invalid eol-type");
7749
7750   aliases = Fcons (name, Qnil);
7751
7752   if (NILP (eol_type))
7753     {
7754       eol_type = make_subsidiaries (name);
7755       for (i = 0; i < 3; i++)
7756         {
7757           Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
7758
7759           this_name = AREF (eol_type, i);
7760           this_aliases = Fcons (this_name, Qnil);
7761           this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
7762           this_spec = Fmake_vector (make_number (3), attrs);
7763           ASET (this_spec, 1, this_aliases);
7764           ASET (this_spec, 2, this_eol_type);
7765           Fputhash (this_name, this_spec, Vcoding_system_hash_table);
7766           Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
7767           Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
7768                                         Vcoding_system_alist);
7769         }
7770     }
7771
7772   spec_vec = Fmake_vector (make_number (3), attrs);
7773   ASET (spec_vec, 1, aliases);
7774   ASET (spec_vec, 2, eol_type);
7775
7776   Fputhash (name, spec_vec, Vcoding_system_hash_table);
7777   Vcoding_system_list = Fcons (name, Vcoding_system_list);
7778   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
7779                                 Vcoding_system_alist);
7780
7781   {
7782     int id = coding_categories[category].id;
7783
7784     if (id < 0 || EQ (name, CODING_ID_NAME (id)))
7785       setup_coding_system (name, &coding_categories[category]);
7786   }
7787
7788   return Qnil;
7789
7790  short_args:
7791   return Fsignal (Qwrong_number_of_arguments,
7792                   Fcons (intern ("define-coding-system-internal"),
7793                          make_number (nargs)));
7794 }
7795
7796 /* Fixme: should this record the alias relationships for
7797    diagnostics?  */
7798 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
7799        Sdefine_coding_system_alias, 2, 2, 0,
7800        doc: /* Define ALIAS as an alias for CODING-SYSTEM.  */)
7801      (alias, coding_system)
7802      Lisp_Object alias, coding_system;
7803 {
7804   Lisp_Object spec, aliases, eol_type;
7805
7806   CHECK_SYMBOL (alias);
7807   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7808   aliases = AREF (spec, 1);
7809   while (!NILP (XCDR (aliases)))
7810     aliases = XCDR (aliases);
7811   XCDR (aliases) = Fcons (alias, Qnil);
7812
7813   eol_type = AREF (spec, 2);
7814   if (VECTORP (eol_type))
7815     {
7816       Lisp_Object subsidiaries;
7817       int i;
7818
7819       subsidiaries = make_subsidiaries (alias);
7820       for (i = 0; i < 3; i++)
7821         Fdefine_coding_system_alias (AREF (subsidiaries, i),
7822                                      AREF (eol_type, i));
7823
7824       ASET (spec, 2, subsidiaries);
7825     }
7826
7827   Fputhash (alias, spec, Vcoding_system_hash_table);
7828   Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
7829                                 Vcoding_system_alist);
7830
7831   return Qnil;
7832 }
7833
7834 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
7835        1, 1, 0,
7836        doc: /* Return the base of CODING-SYSTEM.
7837 Any alias or subsidiary coding system is not a base coding system.  */)
7838   (coding_system)
7839      Lisp_Object coding_system;
7840 {
7841   Lisp_Object spec, attrs;
7842
7843   if (NILP (coding_system))
7844     return (Qno_conversion);
7845   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7846   attrs = AREF (spec, 0);
7847   return CODING_ATTR_BASE_NAME (attrs);
7848 }
7849
7850 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
7851        1, 1, 0,
7852        doc: "Return the property list of CODING-SYSTEM.")
7853      (coding_system)
7854      Lisp_Object coding_system;
7855 {
7856   Lisp_Object spec, attrs;
7857
7858   if (NILP (coding_system))
7859     coding_system = Qno_conversion;
7860   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7861   attrs = AREF (spec, 0);
7862   return CODING_ATTR_PLIST (attrs);
7863 }
7864
7865
7866 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
7867        1, 1, 0,
7868        doc: /* Return the list of aliases of CODING-SYSTEM.  */)
7869      (coding_system)
7870      Lisp_Object coding_system;
7871 {
7872   Lisp_Object spec;
7873
7874   if (NILP (coding_system))
7875     coding_system = Qno_conversion;
7876   CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
7877   return AREF (spec, 1);
7878 }
7879
7880 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
7881        Scoding_system_eol_type, 1, 1, 0,
7882        doc: /* Return eol-type of CODING-SYSTEM.
7883 An eol-type is integer 0, 1, 2, or a vector of coding systems.
7884
7885 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
7886 and CR respectively.
7887
7888 A vector value indicates that a format of end-of-line should be
7889 detected automatically.  Nth element of the vector is the subsidiary
7890 coding system whose eol-type is N.  */)
7891      (coding_system)
7892      Lisp_Object coding_system;
7893 {
7894   Lisp_Object spec, eol_type;
7895   int n;
7896
7897   if (NILP (coding_system))
7898     coding_system = Qno_conversion;
7899   if (! CODING_SYSTEM_P (coding_system))
7900     return Qnil;
7901   spec = CODING_SYSTEM_SPEC (coding_system);
7902   eol_type = AREF (spec, 2);
7903   if (VECTORP (eol_type))
7904     return Fcopy_sequence (eol_type);
7905   n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
7906   return make_number (n);
7907 }
7908
7909 #endif /* emacs */
7910
7911 \f
7912 /*** 9. Post-amble ***/
7913
7914 void
7915 init_coding_once ()
7916 {
7917   int i;
7918
7919   for (i = 0; i < coding_category_max; i++)
7920     {
7921       coding_categories[i].id = -1;
7922       coding_priorities[i] = i;
7923     }
7924
7925   /* ISO2022 specific initialize routine.  */
7926   for (i = 0; i < 0x20; i++)
7927     iso_code_class[i] = ISO_control_0;
7928   for (i = 0x21; i < 0x7F; i++)
7929     iso_code_class[i] = ISO_graphic_plane_0;
7930   for (i = 0x80; i < 0xA0; i++)
7931     iso_code_class[i] = ISO_control_1;
7932   for (i = 0xA1; i < 0xFF; i++)
7933     iso_code_class[i] = ISO_graphic_plane_1;
7934   iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
7935   iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
7936   iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
7937   iso_code_class[ISO_CODE_SO] = ISO_shift_out;
7938   iso_code_class[ISO_CODE_SI] = ISO_shift_in;
7939   iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
7940   iso_code_class[ISO_CODE_ESC] = ISO_escape;
7941   iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
7942   iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
7943   iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
7944
7945   inhibit_pre_post_conversion = 0;
7946
7947   for (i = 0; i < 256; i++)
7948     {
7949       emacs_mule_bytes[i] = 1;
7950     }
7951   emacs_mule_bytes[LEADING_CODE_PRIVATE_11] = 3;
7952   emacs_mule_bytes[LEADING_CODE_PRIVATE_12] = 3;
7953   emacs_mule_bytes[LEADING_CODE_PRIVATE_21] = 4;
7954   emacs_mule_bytes[LEADING_CODE_PRIVATE_22] = 4;
7955 }
7956
7957 #ifdef emacs
7958
7959 void
7960 syms_of_coding ()
7961 {
7962   staticpro (&Vcoding_system_hash_table);
7963   Vcoding_system_hash_table = Fmakehash (Qeq);
7964
7965   staticpro (&Vsjis_coding_system);
7966   Vsjis_coding_system = Qnil;
7967
7968   staticpro (&Vbig5_coding_system);
7969   Vbig5_coding_system = Qnil;
7970
7971   staticpro (&Vcode_conversion_work_buf_list);
7972   Vcode_conversion_work_buf_list = Qnil;
7973
7974   staticpro (&Vcode_conversion_reused_work_buf);
7975   Vcode_conversion_reused_work_buf = Qnil;
7976
7977   DEFSYM (Qcharset, "charset");
7978   DEFSYM (Qtarget_idx, "target-idx");
7979   DEFSYM (Qcoding_system_history, "coding-system-history");
7980   Fset (Qcoding_system_history, Qnil);
7981
7982   /* Target FILENAME is the first argument.  */
7983   Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
7984   /* Target FILENAME is the third argument.  */
7985   Fput (Qwrite_region, Qtarget_idx, make_number (2));
7986
7987   DEFSYM (Qcall_process, "call-process");
7988   /* Target PROGRAM is the first argument.  */
7989   Fput (Qcall_process, Qtarget_idx, make_number (0));
7990
7991   DEFSYM (Qcall_process_region, "call-process-region");
7992   /* Target PROGRAM is the third argument.  */
7993   Fput (Qcall_process_region, Qtarget_idx, make_number (2));
7994
7995   DEFSYM (Qstart_process, "start-process");
7996   /* Target PROGRAM is the third argument.  */
7997   Fput (Qstart_process, Qtarget_idx, make_number (2));
7998
7999   DEFSYM (Qopen_network_stream, "open-network-stream");
8000   /* Target SERVICE is the fourth argument.  */
8001   Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8002
8003   DEFSYM (Qcoding_system, "coding-system");
8004   DEFSYM (Qcoding_aliases, "coding-aliases");
8005
8006   DEFSYM (Qeol_type, "eol-type");
8007   DEFSYM (Qunix, "unix");
8008   DEFSYM (Qdos, "dos");
8009
8010   DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8011   DEFSYM (Qpost_read_conversion, "post-read-conversion");
8012   DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8013   DEFSYM (Qdefault_char, "default-char");
8014   DEFSYM (Qundecided, "undecided");
8015   DEFSYM (Qno_conversion, "no-conversion");
8016   DEFSYM (Qraw_text, "raw-text");
8017
8018   DEFSYM (Qiso_2022, "iso-2022");
8019
8020   DEFSYM (Qutf_8, "utf-8");
8021
8022   DEFSYM (Qutf_16, "utf-16");
8023   DEFSYM (Qutf_16_be, "utf-16-be");
8024   DEFSYM (Qutf_16_be_nosig, "utf-16-be-nosig");
8025   DEFSYM (Qutf_16_le, "utf-16-l3");
8026   DEFSYM (Qutf_16_le_nosig, "utf-16-le-nosig");
8027   DEFSYM (Qsignature, "signature");
8028   DEFSYM (Qendian, "endian");
8029   DEFSYM (Qbig, "big");
8030   DEFSYM (Qlittle, "little");
8031
8032   DEFSYM (Qshift_jis, "shift-jis");
8033   DEFSYM (Qbig5, "big5");
8034
8035   DEFSYM (Qcoding_system_p, "coding-system-p");
8036
8037   DEFSYM (Qcoding_system_error, "coding-system-error");
8038   Fput (Qcoding_system_error, Qerror_conditions,
8039         Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8040   Fput (Qcoding_system_error, Qerror_message,
8041         build_string ("Invalid coding system"));
8042
8043   /* Intern this now in case it isn't already done.
8044      Setting this variable twice is harmless.
8045      But don't staticpro it here--that is done in alloc.c.  */
8046   Qchar_table_extra_slots = intern ("char-table-extra-slots");
8047
8048   DEFSYM (Qtranslation_table, "translation-table");
8049   Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8050   DEFSYM (Qtranslation_table_id, "translation-table-id");
8051   DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8052   DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8053
8054   DEFSYM (Qvalid_codes, "valid-codes");
8055
8056   DEFSYM (Qemacs_mule, "emacs-mule");
8057
8058   Vcoding_category_table
8059     = Fmake_vector (make_number (coding_category_max), Qnil);
8060   staticpro (&Vcoding_category_table);
8061   /* Followings are target of code detection.  */
8062   ASET (Vcoding_category_table, coding_category_iso_7,
8063         intern ("coding-category-iso-7"));
8064   ASET (Vcoding_category_table, coding_category_iso_7_tight,
8065         intern ("coding-category-iso-7-tight"));
8066   ASET (Vcoding_category_table, coding_category_iso_8_1,
8067         intern ("coding-category-iso-8-1"));
8068   ASET (Vcoding_category_table, coding_category_iso_8_2,
8069         intern ("coding-category-iso-8-2"));
8070   ASET (Vcoding_category_table, coding_category_iso_7_else,
8071         intern ("coding-category-iso-7-else"));
8072   ASET (Vcoding_category_table, coding_category_iso_8_else,
8073         intern ("coding-category-iso-8-else"));
8074   ASET (Vcoding_category_table, coding_category_utf_8,
8075         intern ("coding-category-utf-8"));
8076   ASET (Vcoding_category_table, coding_category_utf_16_be,
8077         intern ("coding-category-utf-16-be"));
8078   ASET (Vcoding_category_table, coding_category_utf_16_le,
8079         intern ("coding-category-utf-16-le"));
8080   ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8081         intern ("coding-category-utf-16-be-nosig"));
8082   ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8083         intern ("coding-category-utf-16-le-nosig"));
8084   ASET (Vcoding_category_table, coding_category_charset,
8085         intern ("coding-category-charset"));
8086   ASET (Vcoding_category_table, coding_category_sjis,
8087         intern ("coding-category-sjis"));
8088   ASET (Vcoding_category_table, coding_category_big5,
8089         intern ("coding-category-big5"));
8090   ASET (Vcoding_category_table, coding_category_ccl,
8091         intern ("coding-category-ccl"));
8092   ASET (Vcoding_category_table, coding_category_emacs_mule,
8093         intern ("coding-category-emacs-mule"));
8094   /* Followings are NOT target of code detection.  */
8095   ASET (Vcoding_category_table, coding_category_raw_text,
8096         intern ("coding-category-raw-text"));
8097   ASET (Vcoding_category_table, coding_category_undecided,
8098         intern ("coding-category-undecided"));
8099
8100   defsubr (&Scoding_system_p);
8101   defsubr (&Sread_coding_system);
8102   defsubr (&Sread_non_nil_coding_system);
8103   defsubr (&Scheck_coding_system);
8104   defsubr (&Sdetect_coding_region);
8105   defsubr (&Sdetect_coding_string);
8106   defsubr (&Sfind_coding_systems_region_internal);
8107   defsubr (&Scheck_coding_systems_region);
8108   defsubr (&Sdecode_coding_region);
8109   defsubr (&Sencode_coding_region);
8110   defsubr (&Sdecode_coding_string);
8111   defsubr (&Sencode_coding_string);
8112   defsubr (&Sdecode_sjis_char);
8113   defsubr (&Sencode_sjis_char);
8114   defsubr (&Sdecode_big5_char);
8115   defsubr (&Sencode_big5_char);
8116   defsubr (&Sset_terminal_coding_system_internal);
8117   defsubr (&Sset_safe_terminal_coding_system_internal);
8118   defsubr (&Sterminal_coding_system);
8119   defsubr (&Sset_keyboard_coding_system_internal);
8120   defsubr (&Skeyboard_coding_system);
8121   defsubr (&Sfind_operation_coding_system);
8122   defsubr (&Sset_coding_system_priority);
8123   defsubr (&Sdefine_coding_system_internal);
8124   defsubr (&Sdefine_coding_system_alias);
8125   defsubr (&Scoding_system_base);
8126   defsubr (&Scoding_system_plist);
8127   defsubr (&Scoding_system_aliases);
8128   defsubr (&Scoding_system_eol_type);
8129   defsubr (&Scoding_system_priority_list);
8130
8131   DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8132                doc: /* List of coding systems.
8133
8134 Do not alter the value of this variable manually.  This variable should be
8135 updated by the functions `define-coding-system' and
8136 `define-coding-system-alias'.  */);
8137   Vcoding_system_list = Qnil;
8138
8139   DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8140                doc: /* Alist of coding system names.
8141 Each element is one element list of coding system name.
8142 This variable is given to `completing-read' as TABLE argument.
8143
8144 Do not alter the value of this variable manually.  This variable should be
8145 updated by the functions `make-coding-system' and
8146 `define-coding-system-alias'.  */);
8147   Vcoding_system_alist = Qnil;
8148
8149   DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8150                doc: /* List of coding-categories (symbols) ordered by priority.
8151
8152 On detecting a coding system, Emacs tries code detection algorithms
8153 associated with each coding-category one by one in this order.  When
8154 one algorithm agrees with a byte sequence of source text, the coding
8155 system bound to the corresponding coding-category is selected.  */);
8156   {
8157     int i;
8158
8159     Vcoding_category_list = Qnil;
8160     for (i = coding_category_max - 1; i >= 0; i--)
8161       Vcoding_category_list
8162         = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8163                  Vcoding_category_list);
8164   }
8165
8166   DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8167                doc: /* Specify the coding system for read operations.
8168 It is useful to bind this variable with `let', but do not set it globally.
8169 If the value is a coding system, it is used for decoding on read operation.
8170 If not, an appropriate element is used from one of the coding system alists:
8171 There are three such tables, `file-coding-system-alist',
8172 `process-coding-system-alist', and `network-coding-system-alist'.  */);
8173   Vcoding_system_for_read = Qnil;
8174
8175   DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8176                doc: /* Specify the coding system for write operations.
8177 Programs bind this variable with `let', but you should not set it globally.
8178 If the value is a coding system, it is used for encoding of output,
8179 when writing it to a file and when sending it to a file or subprocess.
8180
8181 If this does not specify a coding system, an appropriate element
8182 is used from one of the coding system alists:
8183 There are three such tables, `file-coding-system-alist',
8184 `process-coding-system-alist', and `network-coding-system-alist'.
8185 For output to files, if the above procedure does not specify a coding system,
8186 the value of `buffer-file-coding-system' is used.  */);
8187   Vcoding_system_for_write = Qnil;
8188
8189   DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8190                doc: /*
8191 Coding system used in the latest file or process I/O.  */);
8192   Vlast_coding_system_used = Qnil;
8193
8194   DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8195                doc: /*
8196 *Non-nil means always inhibit code conversion of end-of-line format.
8197 See info node `Coding Systems' and info node `Text and Binary' concerning
8198 such conversion.  */);
8199   inhibit_eol_conversion = 0;
8200
8201   DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8202                doc: /*
8203 Non-nil means process buffer inherits coding system of process output.
8204 Bind it to t if the process output is to be treated as if it were a file
8205 read from some filesystem.  */);
8206   inherit_process_coding_system = 0;
8207
8208   DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8209                doc: /*
8210 Alist to decide a coding system to use for a file I/O operation.
8211 The format is ((PATTERN . VAL) ...),
8212 where PATTERN is a regular expression matching a file name,
8213 VAL is a coding system, a cons of coding systems, or a function symbol.
8214 If VAL is a coding system, it is used for both decoding and encoding
8215 the file contents.
8216 If VAL is a cons of coding systems, the car part is used for decoding,
8217 and the cdr part is used for encoding.
8218 If VAL is a function symbol, the function must return a coding system
8219 or a cons of coding systems which are used as above.  The function gets
8220 the arguments with which `find-operation-coding-systems' was called.
8221
8222 See also the function `find-operation-coding-system'
8223 and the variable `auto-coding-alist'.  */);
8224   Vfile_coding_system_alist = Qnil;
8225
8226   DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8227                doc: /*
8228 Alist to decide a coding system to use for a process I/O operation.
8229 The format is ((PATTERN . VAL) ...),
8230 where PATTERN is a regular expression matching a program name,
8231 VAL is a coding system, a cons of coding systems, or a function symbol.
8232 If VAL is a coding system, it is used for both decoding what received
8233 from the program and encoding what sent to the program.
8234 If VAL is a cons of coding systems, the car part is used for decoding,
8235 and the cdr part is used for encoding.
8236 If VAL is a function symbol, the function must return a coding system
8237 or a cons of coding systems which are used as above.
8238
8239 See also the function `find-operation-coding-system'.  */);
8240   Vprocess_coding_system_alist = Qnil;
8241
8242   DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8243                doc: /*
8244 Alist to decide a coding system to use for a network I/O operation.
8245 The format is ((PATTERN . VAL) ...),
8246 where PATTERN is a regular expression matching a network service name
8247 or is a port number to connect to,
8248 VAL is a coding system, a cons of coding systems, or a function symbol.
8249 If VAL is a coding system, it is used for both decoding what received
8250 from the network stream and encoding what sent to the network stream.
8251 If VAL is a cons of coding systems, the car part is used for decoding,
8252 and the cdr part is used for encoding.
8253 If VAL is a function symbol, the function must return a coding system
8254 or a cons of coding systems which are used as above.
8255
8256 See also the function `find-operation-coding-system'.  */);
8257   Vnetwork_coding_system_alist = Qnil;
8258
8259   DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8260                doc: /* Coding system to use with system messages.
8261 Also used for decoding keyboard input on X Window system.  */);
8262   Vlocale_coding_system = Qnil;
8263
8264   /* The eol mnemonics are reset in startup.el system-dependently.  */
8265   DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8266                doc: /*
8267 *String displayed in mode line for UNIX-like (LF) end-of-line format.  */);
8268   eol_mnemonic_unix = build_string (":");
8269
8270   DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8271                doc: /*
8272 *String displayed in mode line for DOS-like (CRLF) end-of-line format.  */);
8273   eol_mnemonic_dos = build_string ("\\");
8274
8275   DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8276                doc: /*
8277 *String displayed in mode line for MAC-like (CR) end-of-line format.  */);
8278   eol_mnemonic_mac = build_string ("/");
8279
8280   DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8281                doc: /*
8282 *String displayed in mode line when end-of-line format is not yet determined.  */);
8283   eol_mnemonic_undecided = build_string (":");
8284
8285   DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8286                doc: /*
8287 *Non-nil enables character translation while encoding and decoding.  */);
8288   Venable_character_translation = Qt;
8289
8290   DEFVAR_LISP ("standard-translation-table-for-decode",
8291                &Vstandard_translation_table_for_decode,
8292                doc: /* Table for translating characters while decoding.  */);
8293   Vstandard_translation_table_for_decode = Qnil;
8294
8295   DEFVAR_LISP ("standard-translation-table-for-encode",
8296                &Vstandard_translation_table_for_encode,
8297                doc: /* Table for translating characters while encoding.  */);
8298   Vstandard_translation_table_for_encode = Qnil;
8299
8300   DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8301                doc: /* Alist of charsets vs revision numbers.
8302 While encoding, if a charset (car part of an element) is found,
8303 designate it with the escape sequence identifying revision (cdr part
8304 of the element).  */);
8305   Vcharset_revision_table = Qnil;
8306
8307   DEFVAR_LISP ("default-process-coding-system",
8308                &Vdefault_process_coding_system,
8309                doc: /* Cons of coding systems used for process I/O by default.
8310 The car part is used for decoding a process output,
8311 the cdr part is used for encoding a text to be sent to a process.  */);
8312   Vdefault_process_coding_system = Qnil;
8313
8314   DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8315                doc: /*
8316 Table of extra Latin codes in the range 128..159 (inclusive).
8317 This is a vector of length 256.
8318 If Nth element is non-nil, the existence of code N in a file
8319 \(or output of subprocess) doesn't prevent it to be detected as
8320 a coding system of ISO 2022 variant which has a flag
8321 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8322 or reading output of a subprocess.
8323 Only 128th through 159th elements has a meaning.  */);
8324   Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8325
8326   DEFVAR_LISP ("select-safe-coding-system-function",
8327                &Vselect_safe_coding_system_function,
8328                doc: /*
8329 Function to call to select safe coding system for encoding a text.
8330
8331 If set, this function is called to force a user to select a proper
8332 coding system which can encode the text in the case that a default
8333 coding system used in each operation can't encode the text.
8334
8335 The default value is `select-safe-coding-system' (which see).  */);
8336   Vselect_safe_coding_system_function = Qnil;
8337
8338   DEFVAR_BOOL ("inhibit-iso-escape-detection",
8339                &inhibit_iso_escape_detection,
8340                doc: /*
8341 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8342
8343 By default, on reading a file, Emacs tries to detect how the text is
8344 encoded.  This code detection is sensitive to escape sequences.  If
8345 the sequence is valid as ISO2022, the code is determined as one of
8346 the ISO2022 encodings, and the file is decoded by the corresponding
8347 coding system (e.g. `iso-2022-7bit').
8348
8349 However, there may be a case that you want to read escape sequences in
8350 a file as is.  In such a case, you can set this variable to non-nil.
8351 Then, as the code detection ignores any escape sequences, no file is
8352 detected as encoded in some ISO2022 encoding.  The result is that all
8353 escape sequences become visible in a buffer.
8354
8355 The default value is nil, and it is strongly recommended not to change
8356 it.  That is because many Emacs Lisp source files that contain
8357 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8358 in Emacs's distribution, and they won't be decoded correctly on
8359 reading if you suppress escape sequence detection.
8360
8361 The other way to read escape sequences in a file without decoding is
8362 to explicitly specify some coding system that doesn't use ISO2022's
8363 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument].  */);
8364   inhibit_iso_escape_detection = 0;
8365
8366   {
8367     Lisp_Object args[coding_arg_max];
8368     Lisp_Object plist[14];
8369     int i;
8370
8371     for (i = 0; i < coding_arg_max; i++)
8372       args[i] = Qnil;
8373
8374     plist[0] = intern (":name");
8375     plist[1] = args[coding_arg_name] = Qno_conversion;
8376     plist[2] = intern (":mnemonic");
8377     plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8378     plist[4] = intern (":coding-type");
8379     plist[5] = args[coding_arg_coding_type] = Qraw_text;
8380     plist[6] = intern (":ascii-compatible-p");
8381     plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8382     plist[8] = intern (":default-char");
8383     plist[9] = args[coding_arg_default_char] = make_number (0);
8384     plist[10] = intern (":docstring");
8385     plist[11] = build_string ("Do no conversion.\n\
8386 \n\
8387 When you visit a file with this coding, the file is read into a\n\
8388 unibyte buffer as is, thus each byte of a file is treated as a\n\
8389 character.");
8390     plist[12] = intern (":eol-type");
8391     plist[13] = args[coding_arg_eol_type] = Qunix;
8392     args[coding_arg_plist] = Flist (14, plist);
8393     Fdefine_coding_system_internal (coding_arg_max, args);
8394   }
8395
8396   setup_coding_system (Qno_conversion, &keyboard_coding);
8397   setup_coding_system (Qno_conversion, &terminal_coding);
8398   setup_coding_system (Qno_conversion, &safe_terminal_coding);
8399 }
8400
8401 char *
8402 emacs_strerror (error_number)
8403      int error_number;
8404 {
8405   char *str;
8406
8407   synchronize_system_messages_locale ();
8408   str = strerror (error_number);
8409
8410   if (! NILP (Vlocale_coding_system))
8411     {
8412       Lisp_Object dec = code_convert_string_norecord (build_string (str),
8413                                                       Vlocale_coding_system,
8414                                                       0);
8415       str = (char *) XSTRING (dec)->data;
8416     }
8417
8418   return str;
8419 }
8420
8421 #endif /* emacs */